library(DESeq2)
library(RColorBrewer)
library(Biobase)
library(pheatmap)
library(annotables)
library(openxlsx)
library(rio)
library(tidyverse)counts_data<- read.table ("featurecounts.txt", header=T, row.names = 1)head(counts_GSEA)Geneid <chr> | Chr <chr> | ||
|---|---|---|---|
| 1 | ENSG00000223972.5 | chr1;chr1;chr1;chr1;chr1;chr1;chr1;chr1;chr1 | |
| 2 | ENSG00000227232.5 | chr1;chr1;chr1;chr1;chr1;chr1;chr1;chr1;chr1;chr1;chr1 | |
| 3 | ENSG00000278267.1 | chr1 | |
| 4 | ENSG00000243485.5 | chr1;chr1;chr1;chr1;chr1 | |
| 5 | ENSG00000284332.1 | chr1 | |
| 6 | ENSG00000237613.2 | chr1;chr1;chr1;chr1;chr1 |
head(counts_data) #view the counts_data table Chr <chr> | ||
|---|---|---|
| ENSG00000223972.5 | chr1;chr1;chr1;chr1;chr1;chr1;chr1;chr1;chr1 | |
| ENSG00000227232.5 | chr1;chr1;chr1;chr1;chr1;chr1;chr1;chr1;chr1;chr1;chr1 | |
| ENSG00000278267.1 | chr1 | |
| ENSG00000243485.5 | chr1;chr1;chr1;chr1;chr1 | |
| ENSG00000284332.1 | chr1 | |
| ENSG00000237613.2 | chr1;chr1;chr1;chr1;chr1 |
counts_data <- counts_data [ ,6:ncol(counts_data)]
#head(counts_data)colnames(counts_data) <- c("NHEM76","NHEM77","Sbcl2_64","Sbcl2_70","WM1158_68","WM1158_74","WM1366_66","WM1366_72","WM3211_65","WM3211_71","WM793_67","WM793_73","WM9_69","WM9_75")head(counts_data)NHEM76 <int> | NHEM77 <int> | Sbcl2_64 <int> | Sbcl2_70 <int> | WM1158_68 <int> | WM1158_74 <int> | WM1366_66 <int> | WM1366_72 <int> | ||
|---|---|---|---|---|---|---|---|---|---|
| ENSG00000223972.5 | 2 | 0 | 2 | 1 | 2 | 2 | 3 | 5 | |
| ENSG00000227232.5 | 7 | 9 | 10 | 9 | 13 | 5 | 5 | 10 | |
| ENSG00000278267.1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 2 | |
| ENSG00000243485.5 | 0 | 0 | 1 | 1 | 1 | 2 | 0 | 0 | |
| ENSG00000284332.1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | |
| ENSG00000237613.2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
# Getting columns names and positions
colnames(counts_data) [1] "NHEM76" "NHEM77" "Sbcl2_64" "Sbcl2_70" "WM1158_68"
[6] "WM1158_74" "WM1366_66" "WM1366_72" "WM3211_65" "WM3211_71"
[11] "WM793_67" "WM793_73" "WM9_69" "WM9_75"
#Rearranging by columns positions
counts_data <- counts_data[, c(1, 2, 3, 4, 7, 8, 9, 10, 11, 12, 13, 14, 5, 6)]This step is important, to be able to group them when creating the metadata needed for the DESeq2 object.
head(counts_data)NHEM76 <int> | NHEM77 <int> | Sbcl2_64 <int> | Sbcl2_70 <int> | WM1366_66 <int> | WM1366_72 <int> | WM3211_65 <int> | WM3211_71 <int> | ||
|---|---|---|---|---|---|---|---|---|---|
| ENSG00000223972.5 | 2 | 0 | 2 | 1 | 3 | 5 | 1 | 0 | |
| ENSG00000227232.5 | 7 | 9 | 10 | 9 | 5 | 10 | 7 | 4 | |
| ENSG00000278267.1 | 1 | 0 | 0 | 0 | 0 | 2 | 0 | 0 | |
| ENSG00000243485.5 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | |
| ENSG00000284332.1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | |
| ENSG00000237613.2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
Filtering out the lowly expressed genes and checking the dimension of the data set. We will make it more than 10 because lowly expressed genes are kind of hard to measure for differential expression e.g 1 read or 2 reads.
dim(counts_data) #before[1] 58721 14
counts_data <- counts_data [rowSums(counts_data) > 10, ]
dim(counts_data) #after[1] 26574 14
rows is reduced from 58721 to 26574
For genes (i.e. Ensembl identifiers of the form ENSG*), the version number increments when the set of transcripts linked to a gene changes.
The “dot digit” representing distinct version numbers associated with stable Ensembl identifiers must be removed from the Ensembl gene id.Ensembl Documentation stable IDS.
This is important to be able to match them with grcm38 dataset in “annotables” package.
tmp=gsub("\\..*","",row.names(counts_data))rownames(counts_data) <- tmp
head(counts_data)NHEM76 <int> | NHEM77 <int> | Sbcl2_64 <int> | Sbcl2_70 <int> | WM1366_66 <int> | WM1366_72 <int> | WM3211_65 <int> | WM3211_71 <int> | WM793_67 <int> | ||
|---|---|---|---|---|---|---|---|---|---|---|
| ENSG00000223972 | 2 | 0 | 2 | 1 | 3 | 5 | 1 | 0 | 1 | |
| ENSG00000227232 | 7 | 9 | 10 | 9 | 5 | 10 | 7 | 4 | 4 | |
| ENSG00000241860 | 0 | 2 | 2 | 2 | 2 | 5 | 1 | 1 | 2 | |
| ENSG00000279457 | 4 | 9 | 19 | 13 | 2 | 2 | 2 | 6 | 5 | |
| ENSG00000228463 | 11 | 20 | 1 | 3 | 10 | 5 | 7 | 2 | 6 | |
| ENSG00000237094 | 3 | 2 | 5 | 5 | 7 | 25 | 6 | 11 | 1 |
colData <- data.frame(row.names=colnames(counts_data), group, con)
colDatagroup <fctr> | con <fctr> | |||
|---|---|---|---|---|
| NHEM76 | NHEM | normal | ||
| NHEM77 | NHEM | normal | ||
| Sbcl2_64 | Sbcl2 | radial | ||
| Sbcl2_70 | Sbcl2 | radial | ||
| WM1366_66 | WM1366 | vertical | ||
| WM1366_72 | WM1366 | vertical | ||
| WM3211_65 | WM3211 | vertical | ||
| WM3211_71 | WM3211 | vertical | ||
| WM793_67 | WM793 | vertical | ||
| WM793_73 | WM793 | vertical |
colData variable will be our metadata
colData <- data.frame(row.names=colnames(counts_data), group, con)
colDatagroup <fctr> | con <fctr> | |||
|---|---|---|---|---|
| NHEM76 | NHEM | normal | ||
| NHEM77 | NHEM | normal | ||
| Sbcl2_64 | Sbcl2 | radial | ||
| Sbcl2_70 | Sbcl2 | radial | ||
| WM1366_66 | WM1366 | vertical | ||
| WM1366_72 | WM1366 | vertical | ||
| WM3211_65 | WM3211 | vertical | ||
| WM3211_71 | WM3211 | vertical | ||
| WM793_67 | WM793 | vertical | ||
| WM793_73 | WM793 | vertical |
#Checking whether the row names in colData matches to column names in counts_data
all(colnames(counts_data) %in% rownames(colData))[1] TRUE
# Ensure that the sample names are in the same order in both datasets,
all(colnames(counts_data) == rownames(colData))[1] TRUE
If not, we can modify this with match() function.
mycols <- brewer.pal(11, "Set3")[1:length(unique(group))]As input, the DESeq2 package expects count data as obtained, e.g., from RNA-seq in the form of a matrix of integer values which should be un-normalized counts.
Because the DESeq2 model corrects for library size internally, converted or normalized numbers should not be given as input, such as counts scaled by library size.
dds<- DESeqDataSetFromMatrix (countData= counts_data, colData=colData, design= ~ con)design(dds)~con
design(dds)~con
We now have a DESeq2 object (dds) containing our raw counts and metadata, and we want to do quality control on our samples. As a result, the normalized counts must be generated (normalized for library size, which is the total number of gene counts per sample, while accounting for library composition).
To evaluate the sample-level quality control matrix, the first step in the process is to normalize the raw counts.
what is meant by normalized raw counts ?
The raw counts represent the number of reads aligning to each gene and should be proportional to the RNA expression in the sample; however, there are additional factors that can influence the number of genes aligning to each gene besides RNA expression.
Using normalization methods, we may alter the count data to
remove the influence of these factors and remove the overall counts.
Library depth, gene length, and RNA composition are three of the most important parameters to consider when normalizing count data.
In terms of gene length, a longer gene results in a longer transcript, which results in more fragments for sequencing. As a result, a longer gene with the same amount of expression will have more counts than a shorter gene. Therefore, when comparing the expression levels of various genes, the lengths of the genes must be taken into account.
When adjusting for library size, the library’s composition is also crucial to consider. Many normalization methods that are not resistant to outliers can be skewed by a few highly expressed genes.
Normalization for the majority of genes would be skewed by the highly expressed DE gene if we only divided our counts by the total number of reads. As a result, we need to use a technique that is resistant to these outlier genes when performing a DE analysis.
DESeq2 normalizes using the “median of ratios” method. which accounts for library size when computing raw counts and is resistant to large numbers of differentially expressed genes.
For normalization, the raw counts of each sample will be divided by the sample-specific size factor.
We will create a data table with read counts normalized by library size
Because we have different library sizes for the same number of genes, we need to make a ratio between them, therefore we’ll multiply every position by this size factor, which we can achieve by estimating size factors.
dds <- estimateSizeFactors(dds)
sizeFactors(dds)# to view the size factors used for normalization NHEM76 NHEM77 Sbcl2_64 Sbcl2_70 WM1366_66 WM1366_72 WM3211_65
1.1675968 1.7287595 1.1541503 1.2013835 0.6304523 1.2699113 0.7400715
WM3211_71 WM793_67 WM793_73 WM9_69 WM9_75 WM1158_68 WM1158_74
0.8725916 1.0196053 1.0923370 1.5843539 0.4016549 1.4756025 0.8078638
colData(dds)DataFrame with 14 rows and 3 columns
group con sizeFactor
<factor> <factor> <numeric>
NHEM76 NHEM normal 1.167597
NHEM77 NHEM normal 1.728759
Sbcl2_64 Sbcl2 radial 1.154150
Sbcl2_70 Sbcl2 radial 1.201383
WM1366_66 WM1366 vertical 0.630452
... ... ... ...
WM793_73 WM793 vertical 1.092337
WM9_69 WM9 metastasis 1.584354
WM9_75 WM9 metastasis 0.401655
WM1158_68 WM1158 metastasis 1.475603
WM1158_74 WM1158 metastasis 0.807864
head(counts(dds)) #un-normalized NHEM76 NHEM77 Sbcl2_64 Sbcl2_70 WM1366_66 WM1366_72
ENSG00000223972 2 0 2 1 3 5
ENSG00000227232 7 9 10 9 5 10
ENSG00000241860 0 2 2 2 2 5
ENSG00000279457 4 9 19 13 2 2
ENSG00000228463 11 20 1 3 10 5
ENSG00000237094 3 2 5 5 7 25
WM3211_65 WM3211_71 WM793_67 WM793_73 WM9_69 WM9_75
ENSG00000223972 1 0 1 1 0 2
ENSG00000227232 7 4 4 5 8 5
ENSG00000241860 1 1 2 2 4 0
ENSG00000279457 2 6 5 4 12 2
ENSG00000228463 7 2 6 2 7 3
ENSG00000237094 6 11 1 4 16 6
WM1158_68 WM1158_74
ENSG00000223972 2 2
ENSG00000227232 13 5
ENSG00000241860 9 1
ENSG00000279457 7 3
ENSG00000228463 2 0
ENSG00000237094 20 15
Now we have the size factors been calculated, and added to the DESeq2 object, the normalized counts can be extracted from it using the counts() function.
norm_counts<- counts(dds, normalized = TRUE)
head(norm_counts) NHEM76 NHEM77 Sbcl2_64 Sbcl2_70 WM1366_66
ENSG00000223972 1.71292 0.000000 1.7328766 0.8323737 4.758488
ENSG00000227232 5.99522 5.206045 8.6643831 7.4913633 7.930814
ENSG00000241860 0.00000 1.156899 1.7328766 1.6647474 3.172326
ENSG00000279457 3.42584 5.206045 16.4623279 10.8208581 3.172326
ENSG00000228463 9.42106 11.568989 0.8664383 2.4971211 15.861628
ENSG00000237094 2.56938 1.156899 4.3321916 4.1618685 11.103139
WM1366_72 WM3211_65 WM3211_71 WM793_67 WM793_73
ENSG00000223972 3.937283 1.351221 0.000000 0.9807716 0.9154684
ENSG00000227232 7.874565 9.458546 4.584046 3.9230866 4.5773421
ENSG00000241860 3.937283 1.351221 1.146011 1.9615433 1.8309368
ENSG00000279457 1.574913 2.702442 6.876069 4.9038582 3.6618737
ENSG00000228463 3.937283 9.458546 2.292023 5.8846298 1.8309368
ENSG00000237094 19.686414 8.107325 12.606126 0.9807716 3.6618737
WM9_69 WM9_75 WM1158_68 WM1158_74
ENSG00000223972 0.000000 4.979399 1.355379 2.475665
ENSG00000227232 5.049377 12.448497 8.809960 6.189162
ENSG00000241860 2.524688 0.000000 6.099203 1.237832
ENSG00000279457 7.574065 4.979399 4.743825 3.713497
ENSG00000228463 4.418205 7.469098 1.355379 0.000000
ENSG00000237094 10.098754 14.938196 13.553785 18.567486
sum(is.na(norm_counts))[1] 0
gsea_file <- read_delim("ddsNormSF.txt", "\t", escape_double = FALSE, trim_ws = TRUE)New names:
* `` -> `...1`
Rows: 26574 Columns: 15
-- Column specification ----------------------------------------------
Delimiter: "\t"
chr (1): ...1
dbl (14): NHEM76, NHEM77, Sbcl2_64, Sbcl2_70, WM1366_66, WM1366_72...
i Use `spec()` to retrieve the full column specification for this data.
i Specify the column types or set `show_col_types = FALSE` to quiet this message.
head(gsea_file)...1 <chr> | NHEM76 <dbl> | NHEM77 <dbl> | Sbcl2_64 <dbl> | Sbcl2_70 <dbl> | WM1366_66 <dbl> | WM1366_72 <dbl> | WM3211_65 <dbl> | WM3211_71 <dbl> | |
|---|---|---|---|---|---|---|---|---|---|
| ENSG00000223972 | 1.71292 | 0.000000 | 1.7328766 | 0.8323737 | 4.758488 | 3.937283 | 1.351221 | 0.000000 | |
| ENSG00000227232 | 5.99522 | 5.206045 | 8.6643831 | 7.4913633 | 7.930814 | 7.874565 | 9.458546 | 4.584046 | |
| ENSG00000241860 | 0.00000 | 1.156899 | 1.7328766 | 1.6647474 | 3.172326 | 3.937283 | 1.351221 | 1.146011 | |
| ENSG00000279457 | 3.42584 | 5.206045 | 16.4623279 | 10.8208581 | 3.172326 | 1.574913 | 2.702442 | 6.876069 | |
| ENSG00000228463 | 9.42106 | 11.568989 | 0.8664383 | 2.4971211 | 15.861628 | 3.937283 | 9.458546 | 2.292023 | |
| ENSG00000237094 | 2.56938 | 1.156899 | 4.3321916 | 4.1618685 | 11.103139 | 19.686414 | 8.107325 | 12.606126 |
write.table(gsea_file, file = "gsea_inputfile.txt", quote = FALSE, row.names = FALSE, sep = "\t")#test12 <- read.delim("ddsNormSF2.txt")
#head(test12)
#read_delim("file.txt", "\t", escape_double = FALSE, trim_ws = TRUE)
#write.table(df, file = "df.txt", quote = FALSE, row.names = FALSE, sep = "\t")
#save the normalized counts in a table
#write.table(norm_counts, file = "C:/Users/melbe/Documents/R/R_Projects/Diff_Exp_thesis/ddsNormSF2.txt", sep = "\t", col.names = NA)
Now that we’ve normalized the counts, we can move on to the next step of our analysis. We can now compare the counts between the different samples because the counts have been normalized for library size.
To measure the quality of our experiment, we can look at how the samples relate in terms of gene expression.
Visualization approaches for unsupervised clustering analyses, such as hierarchical heatmaps and principal component analysis PCA, are used to do this.
These QC approaches are used to determine how comparable the biological replicates are to one another, as well as to detect outlier samples and major sources of variation in the data set.
To better the visualization of the clustering, we should first use the log to transform the normalized counts before using these Visualization methods. DESeq2 applies a variance stabilizing transformation (VST) to RNA-Seq data, which is a logarithmic transformation that reduces variance across the mean.
The DESeq2 vst() function on the DESeq2 object can be used to transform the normalized counts. The blind = True argument indicates that the transformation should be blind to the sample information provided in the design formula; this argument should be stated during the quality evaluation.
vsd <- vst(dds, blind = TRUE)To assess the similarity and gene expression between different samples in a dataset, hierarchical clustering with heatmaps is used.
This method is used to determine how similar replicates are to one another and whether samples from various sample groups cluster together or separately. The heatmap is made by combining the gene expression correlation values for all pairwise combinations of samples in the data set, with 1 being the perfect correlation.
The heatmap’s colors reflect the correlation values, while the hierarchical tree shows which samples are more similar to one another. The biological replicates should be grouped together, whereas the sample conditions should be separated. Because the majority of genes should not be differentially expressed, samples should have a high correlation.
Samples having correlation values less than 0.8 may need to be investigated further to see if they are outliers or have contamination.
To make a heatmap, we’ll use the assay() function to extract the VST-transformed normalized counts as a matrix from a vsd object. The pairwise correlation values between each pair of samples can then be computed using the cor() function.
vsd_mat <- assay(vsd) vsd_cor <- cor(vsd_mat)pheatmap(vsd_cor, annotation = dplyr::select(colData, con))The biological replicates are clustered together, while the samples from various conditions are clustered separately. There are no outliers or samples with low correlation values when compared to the rest of the data.
In order to continue evaluating the quality of our samples, we will use PCA to see how our samples cluster and whether our condition of interest corresponds to the principal components explaining the most variation in the data.
PCA is a technique for emphasizing the variation in a dataset. The first principal component, or PCA1, represents the greatest amount of variance in the data.
we could plot the normalized counts of every gene for one sample in the x-axis and the other sample on the y-axis. PC2, the dataset’s second most variation, must be perpendicular to PC1 in order to best describe the variance in the dataset not included in PC1. PC2 has a much smaller spread.
The number of principal components in the dataset is equal to the number of samples, n. PC1 means plotting a line through n-dimensional space to find the greatest amount of variation.
The principal component with the most variant genes has the greatest influence on the direction of that principal component. Genes are given quantitative scores based on how much they influence the various PCs. The product of the influence and the normalized read counts for each gene is multiplied by all genes to get a ‘per sample’ PC value.We usually plot these per-sample PC values for PCA.
The gene expression profiles of samples that cluster together are more similar than those of samples that cluster apart, especially for the most variant genes. As we hope to see replicated clusters together and conditions to separate on PC1, this is a good method to explore the quality of the data.
This method can also be used to identify sample outliers and major sources of variation.
plotPCA(vsd, intgroup = 'con')For the PCA plot generated in the previous part, the output regarding the quality of the samples shows that The biological replicates tend to cluster together. The samples separate by condition on PC1.
Estimated size factors for each gene, as well as the variation in expression across replicates were evaluated. The data can then be fitted to the negative binomial model using these calculations.
A DESeq2 object containing the raw data, metadata, and design formula has already been created. The design formula tells DESeq2 which known major source of variation to control, for, or regress out, as well as which condition of interest to use for differential expression testing.
#run the analysis
dds <- DESeq(dds)using pre-existing size factors
estimating dispersions
gene-wise dispersion estimates
mean-dispersion relationship
final dispersion estimates
fitting model and testing
The final DESeq2 object will have all of the data required to perform differential expression analysis between different sample groups.
resultsNames(dds)[1] "Intercept" "con_normal_vs_metastasis"
[3] "con_radial_vs_metastasis" "con_vertical_vs_metastasis"
???????????
dds$con <- relevel(dds$con, ref = "normal")#run the analysis again
dds <- DESeq(dds)using pre-existing size factors
estimating dispersions
found already estimated dispersions, replacing these
gene-wise dispersion estimates
mean-dispersion relationship
final dispersion estimates
fitting model and testing
resultsNames(dds)[1] "Intercept" "con_metastasis_vs_normal"
[3] "con_radial_vs_normal" "con_vertical_vs_normal"
???????????
The raw counts are modeled using the dispersion estimates; we should investigate how the raw data will match the model.
The purpose of DE analysis is to see if the mean expression of a gene differs between sample groups when there is variation within groups. This is accomplished by determining whether the difference in log2 fold changes between groups is significantly different from zero.
The log2 fold changes are calculated by dividing the mean of one sample group by the mean of the other sample group. As a result, information regarding the mean and variation in the data is required to model the counts.
We’ll look at the variance in gene expression relative to the mean to see how our data varies.
Variance is the square of the standard deviation, representing how far away the expression of the individual samples, are from the means.
*The variance is expected to increase with the gene’s mean expression in RNA-Seq data. To observe this relationship, we can use the apply() function to calculate the means and variances for each gene in the normal samples. Then, using ggplot2, we create a data frame for plotting. The log10 scales can be used to plot the mean and variance for each gene.
we can plot the mean and variance for each gene using the log10 scales.
In the DESeq2 model, a metric called dispersion describes a measure of variance for a given mean to assess the variability in expression.
Dispersion formula: Var = μ + α*μ^2 .
plotDispEsts(dds)Each black dot represents a gene, with associated mean and dispersion values. With RNA seq data, the variance of gene expression increases as the mean decreases, which is to be expected. Also, notice how the variance range for lower mean counts is wider than for higher mean counts.
As the mean increases, the dispersion values decrease. The increase in variance, on the other hand, increases dispersion.
Because gene-wise estimates of dispersion are often misleading in RNA-seq experiments with only a few replicates, DESeq2 uses information from all genes to determine the most likely estimates of dispersion for a given mean expression value, as shown by the red line in the figure.
Genes with inaccurately tiny estimates of variation could return many false postives, or genes classified as DE, when they are really not. Therefore, the original gene-wise dispersion estimates, shown as the black dots in the figure, are shrunken towards the curve to yield more accurate estimates of dispersion.
For identifying the differentially expressed genes, the more accurate, shrunken dispersion estimates are applied to model the counts.
*Extremely high dispersion values, encircled by blue circles, aren’t shrunk since there’s a chance the gene has more variability than others for biological or technical reasons, and lowering the variability could lead to false positives.
we will try to find the significant genes during the development of cancer cells from melanocytes to the Radial growth phase.
Now that we have explored the fit of our data to the model, we can extract the DE testing model.
We can also get more precise foldchange estimates, which measure the expression of one sample group compared to another. The Wald test is used by DESeq2 to compare two sample groups for the condition of interest, in this case “normal and radial growth Phase.”
#sadad
dds1_res <- results(dds, contrast = c("con", "radial", "normal"))
dds1_reslog2 fold change (MLE): con radial vs normal
Wald test p-value: con radial vs normal
DataFrame with 26574 rows and 6 columns
baseMean log2FoldChange lfcSE stat
<numeric> <numeric> <numeric> <numeric>
ENSG00000223972 1.78799 0.710301 1.992192 0.356542
ENSG00000227232 7.01446 0.536886 0.733066 0.732383
ENSG00000241860 1.98683 1.378421 1.664696 0.828031
ENSG00000279457 5.70124 1.632679 0.796839 2.048945
ENSG00000228463 5.49010 -2.638075 1.296266 -2.035134
... ... ... ... ...
ENSG00000198695 399.68416 -1.588101 0.476504 -3.332819
ENSG00000210194 2.94970 -1.894507 1.723122 -1.099462
ENSG00000198727 1411.29417 -1.208026 0.401252 -3.010640
ENSG00000210195 9.21425 -1.048826 0.793127 -1.322394
ENSG00000210196 58.66427 -0.208716 0.492520 -0.423773
pvalue padj
<numeric> <numeric>
ENSG00000223972 0.7214345 0.851775
ENSG00000227232 0.4639346 0.663017
ENSG00000241860 0.4076527 0.612446
ENSG00000279457 0.0404674 0.130239
ENSG00000228463 0.0418374 0.133405
... ... ...
ENSG00000198695 0.000859709 0.0067447
ENSG00000210194 0.271566402 0.4737640
ENSG00000198727 0.002606980 0.0164832
ENSG00000210195 0.186036892 0.3711853
ENSG00000210196 0.671731428 0.8178945
log2 fold change : condition radial vs normal, stating that normal is the base level of comparison. Which means that all log2fold changes represent the radial group relative to the normal group.
For all genes analyzed, the MA plot depicts the mean of normalized counts vs log2foldchanges.
plotMA(dds, ylim=c(-20,20))for genes with low amount of data available, shrinkage utilizes information from all genes to provide more likely, lower, log2 fold change estimates, similar to what we performed with dispersion.
dds1_LFC <- lfcShrink(dds,
coef = "con_radial_vs_normal" ,
res = dds1_res)using 'apeglm' for LFC shrinkage. If used in published research, please cite:
Zhu, A., Ibrahim, J.G., Love, M.I. (2018) Heavy-tailed prior distributions for
sequence count data: removing the noise and preserving large differences.
Bioinformatics. https://doi.org/10.1093/bioinformatics/bty895
plotMA(dds1_LFC, ylim=c(-20,20))
The log2fold change values are now more restricted, especially for lowly expressed genes.
The shrunken log2foldchanges should be more precise; however, shrinking the log2 foldchanges will not affect the number of differentially expressed genes returned, only the log2 fold change values. Now we can retrieve the significant DE genes and perform further visualization of results.
mcols(dds1_LFC)DataFrame with 5 rows and 2 columns
type description
<character> <character>
baseMean intermediate mean of normalized c..
log2FoldChange results log2 fold change (MA..
lfcSE results posterior SD: con ra..
pvalue results Wald test p-value: c..
padj results BH adjusted p-values
We will use the p-value adjusted for the multiple test correction. Because, every gene tested with alpha equals to 0.05, meanst that there is a 5% chance that the gene is called as a DE when it is not, returning false positives. For this reason, multiple test correction is performed by DESeq2 using Benjamin-Hochberg, or BH-method, to adjust the p-values for multiple testing and control the proportion of false postives relative to true.
If we had 1000 genes categorized as DE and used the BH-method with an alpha value of 0.05, we would expect 5% of the DE genes to be false positives or 50 genes.
Prior to testing, DESeq2 automatically filters out genes that are unlikely to be actually differentially expressed, such as genes with zero counts across all samples, genes with low mean values across all samples, and genes with dramatic count outlines, to limit the number of genes tested.
head(dds1_LFC,n =10 )log2 fold change (MAP): con radial vs normal
Wald test p-value: con radial vs normal
DataFrame with 10 rows and 5 columns
baseMean log2FoldChange lfcSE pvalue
<numeric> <numeric> <numeric> <numeric>
ENSG00000223972 1.78799 0.132969 0.885471 0.721434541
ENSG00000227232 7.01446 0.350785 0.604942 0.463934557
ENSG00000241860 1.98683 0.381369 0.901971 0.407652728
ENSG00000279457 5.70124 1.183497 0.769140 0.040467446
ENSG00000228463 5.49010 -1.436076 1.310678 0.041837426
ENSG00000237094 8.96601 0.562235 0.843891 0.280379678
ENSG00000225972 5.58042 0.564300 0.788241 0.286326887
ENSG00000225630 38.66330 -2.618253 0.785182 0.000084522
ENSG00000237973 141.47226 -0.136626 0.572538 0.763457237
ENSG00000229344 94.37507 0.194176 0.689321 0.692876738
padj
<numeric>
ENSG00000223972 0.851775139
ENSG00000227232 0.663016611
ENSG00000241860 0.612445520
ENSG00000279457 0.130239006
ENSG00000228463 0.133405074
ENSG00000237094 0.483646137
ENSG00000225972 0.490378302
ENSG00000225630 0.000990717
ENSG00000237973 0.880374602
ENSG00000229344 0.832377164
The filtered genes are denoted by a NA in the p-adjusted column in the results table.
summary(dds1_LFC)
out of 26574 with nonzero total read count
adjusted p-value < 0.1
LFC > 0 (up) : 3063, 12%
LFC < 0 (down) : 2972, 11%
outliers [1] : 7, 0.026%
low counts [2] : 5152, 19%
(mean count < 2)
[1] see 'cooksCutoff' argument of ?results
[2] see 'independentFiltering' argument of ?results
Over 1000 genes are DE which is the sum of log fold changes less than zero and greater than zero.
summary(dds1_LFC, alpha = 0.05)
out of 26574 with nonzero total read count
adjusted p-value < 0.05
LFC > 0 (up) : 2308, 8.7%
LFC < 0 (down) : 2401, 9%
outliers [1] : 7, 0.026%
low counts [2] : 5152, 19%
(mean count < 2)
[1] see 'cooksCutoff' argument of ?results
[2] see 'independentFiltering' argument of ?results
A log2fold change threshold could be used to return genes that are most likely to be biologically meaningful. A log2fold change threshold isn’t always the best choice. It can, however, be useful when dealing with huge numbers of DE genes.
#Rerun results function, Using 1.25 foldchange threshold which is 0.32 in the log2 scale
#dds1_res <- results(dds,
# contrast = c("con", "radial", "normal"),
# alpha = 0.05,
# lfcThreshold = 0.32)
#head(dds1_res)While using any log2fold change cut-off raises the risk of losing biologically relevant genes, by using a very small log2 fold change threshold, we are aiming to reduce the risk that the more biologically relevant.
resultsNames(dds)[1] "Intercept" "con_metastasis_vs_normal"
[3] "con_radial_vs_normal" "con_vertical_vs_normal"
#Reshrink the foldchanges
dds1_LFC <- lfcShrink(dds,
coef = "con_radial_vs_normal",
res = dds1_res)using 'apeglm' for LFC shrinkage. If used in published research, please cite:
Zhu, A., Ibrahim, J.G., Love, M.I. (2018) Heavy-tailed prior distributions for
sequence count data: removing the noise and preserving large differences.
Bioinformatics. https://doi.org/10.1093/bioinformatics/bty895
# Turn results table into a data frame
dds1_res_df <- data.frame(dds1_LFC)
head (dds1_res_df)baseMean <dbl> | log2FoldChange <dbl> | lfcSE <dbl> | pvalue <dbl> | padj <dbl> | |
|---|---|---|---|---|---|
| ENSG00000223972 | 1.787989 | 0.1329691 | 0.8854707 | 0.72143454 | 0.8517751 |
| ENSG00000227232 | 7.014458 | 0.3507850 | 0.6049423 | 0.46393456 | 0.6630166 |
| ENSG00000241860 | 1.986826 | 0.3813693 | 0.9019711 | 0.40765273 | 0.6124455 |
| ENSG00000279457 | 5.701238 | 1.1834972 | 0.7691399 | 0.04046745 | 0.1302390 |
| ENSG00000228463 | 5.490095 | -1.4360759 | 1.3106781 | 0.04183743 | 0.1334051 |
| ENSG00000237094 | 8.966015 | 0.5622349 | 0.8438907 | 0.28037968 | 0.4836461 |
dds1_GSEA <- dds1_res_df %>% arrange(padj)#write.table(dds1_res, file = "DE_VGP_MET.txt", sep = " ", col.names = NA)#grcm38
dds1_res_anno <- rownames_to_column(dds1_res_df, var = "ensgene")
head(dds1_res_anno)ensgene <chr> | baseMean <dbl> | log2FoldChange <dbl> | lfcSE <dbl> | pvalue <dbl> | padj <dbl> | |
|---|---|---|---|---|---|---|
| 1 | ENSG00000223972 | 1.787989 | 0.1329691 | 0.8854707 | 0.72143454 | 0.8517751 |
| 2 | ENSG00000227232 | 7.014458 | 0.3507850 | 0.6049423 | 0.46393456 | 0.6630166 |
| 3 | ENSG00000241860 | 1.986826 | 0.3813693 | 0.9019711 | 0.40765273 | 0.6124455 |
| 4 | ENSG00000279457 | 5.701238 | 1.1834972 | 0.7691399 | 0.04046745 | 0.1302390 |
| 5 | ENSG00000228463 | 5.490095 | -1.4360759 | 1.3106781 | 0.04183743 | 0.1334051 |
| 6 | ENSG00000237094 | 8.966015 | 0.5622349 | 0.8438907 | 0.28037968 | 0.4836461 |
hm_annotables <- grch38[, c( "ensgene", "symbol", "description")]
dds_test <- merge(dds1_res_anno, hm_annotables, by.x = "ensgene", by.y = "ensgene", all.x = T )#dds_test <- dds_test%>% dplyr::select(-ends_with(".y"))head(dds_test) ensgene <chr> | baseMean <dbl> | log2FoldChange <dbl> | lfcSE <dbl> | pvalue <dbl> | padj <dbl> | symbol <chr> | ||
|---|---|---|---|---|---|---|---|---|
| 1 | ENSG00000000003 | 571.66860 | 0.87041284 | 0.4815263 | 3.874260e-02 | 0.125803292 | TSPAN6 | |
| 2 | ENSG00000000419 | 1748.17647 | 0.52449759 | 0.4018190 | 1.519862e-01 | 0.325413268 | DPM1 | |
| 3 | ENSG00000000457 | 180.21557 | 1.43813743 | 0.3818214 | 5.129922e-05 | 0.000650043 | SCYL3 | |
| 4 | ENSG00000000460 | 534.67620 | 0.08498325 | 0.2994136 | 7.698350e-01 | 0.883638824 | C1orf112 | |
| 5 | ENSG00000000971 | 13.81695 | 1.25149478 | 1.0308212 | 4.970123e-02 | 0.151470901 | CFH | |
| 6 | ENSG00000001036 | 444.54055 | -0.13173487 | 0.2296575 | 5.518713e-01 | 0.734103036 | FUCA2 |
NAhead(grch38)ensgene <chr> | entrez <int> | symbol <chr> | chr <chr> | start <int> | end <int> | strand <int> | biotype <chr> | |
|---|---|---|---|---|---|---|---|---|
| ENSG00000000003 | 7105 | TSPAN6 | X | 100627108 | 100639991 | -1 | protein_coding | |
| ENSG00000000005 | 64102 | TNMD | X | 100584936 | 100599885 | 1 | protein_coding | |
| ENSG00000000419 | 8813 | DPM1 | 20 | 50934867 | 50959140 | -1 | protein_coding | |
| ENSG00000000457 | 57147 | SCYL3 | 1 | 169849631 | 169894267 | -1 | protein_coding | |
| ENSG00000000460 | 55732 | C1orf112 | 1 | 169662007 | 169854080 | 1 | protein_coding | |
| ENSG00000000938 | 2268 | FGR | 1 | 27612064 | 27635185 | -1 | protein_coding |
dds1_res_anno <- left_join(x = dds1_res_anno,
y = grch38[ , c("ensgene", "symbol", "description")],
by = c("ensgene"))#non_duplicates <- which(duplicated(ids$symbol) == FALSE)
head(dds1_res_anno)ensgene <chr> | baseMean <dbl> | log2FoldChange <dbl> | lfcSE <dbl> | pvalue <dbl> | padj <dbl> | symbol <chr> | ||
|---|---|---|---|---|---|---|---|---|
| 1 | ENSG00000223972 | 1.787989 | 0.1329691 | 0.8854707 | 0.7214345 | 0.8517751 | DDX11L1 | |
| 2 | ENSG00000223972 | 1.787989 | 0.1329691 | 0.8854707 | 0.7214345 | 0.8517751 | DDX11L1 | |
| 3 | ENSG00000223972 | 1.787989 | 0.1329691 | 0.8854707 | 0.7214345 | 0.8517751 | DDX11L1 | |
| 4 | ENSG00000223972 | 1.787989 | 0.1329691 | 0.8854707 | 0.7214345 | 0.8517751 | DDX11L1 | |
| 5 | ENSG00000223972 | 1.787989 | 0.1329691 | 0.8854707 | 0.7214345 | 0.8517751 | DDX11L1 | |
| 6 | ENSG00000227232 | 7.014458 | 0.3507850 | 0.6049423 | 0.4639346 | 0.6630166 | WASH7P |
head(dds1_LFC)log2 fold change (MAP): con radial vs normal
Wald test p-value: con radial vs normal
DataFrame with 6 rows and 5 columns
baseMean log2FoldChange lfcSE pvalue
<numeric> <numeric> <numeric> <numeric>
ENSG00000223972 1.78799 0.132969 0.885471 0.7214345
ENSG00000227232 7.01446 0.350785 0.604942 0.4639346
ENSG00000241860 1.98683 0.381369 0.901971 0.4076527
ENSG00000279457 5.70124 1.183497 0.769140 0.0404674
ENSG00000228463 5.49010 -1.436076 1.310678 0.0418374
ENSG00000237094 8.96601 0.562235 0.843891 0.2803797
padj
<numeric>
ENSG00000223972 0.851775
ENSG00000227232 0.663017
ENSG00000241860 0.612446
ENSG00000279457 0.130239
ENSG00000228463 0.133405
ENSG00000237094 0.483646
head(grch38)ensgene <chr> | entrez <int> | symbol <chr> | chr <chr> | start <int> | end <int> | strand <int> | biotype <chr> | |
|---|---|---|---|---|---|---|---|---|
| ENSG00000000003 | 7105 | TSPAN6 | X | 100627108 | 100639991 | -1 | protein_coding | |
| ENSG00000000005 | 64102 | TNMD | X | 100584936 | 100599885 | 1 | protein_coding | |
| ENSG00000000419 | 8813 | DPM1 | 20 | 50934867 | 50959140 | -1 | protein_coding | |
| ENSG00000000457 | 57147 | SCYL3 | 1 | 169849631 | 169894267 | -1 | protein_coding | |
| ENSG00000000460 | 55732 | C1orf112 | 1 | 169662007 | 169854080 | 1 | protein_coding | |
| ENSG00000000938 | 2268 | FGR | 1 | 27612064 | 27635185 | -1 | protein_coding |
dds1_sig <- subset(dds1_res_anno, padj < 0.05)dds1_sig <- dds1_sig %>% arrange(padj)View(dds1_sig)dds1_first20 <- dds1_sig[1:20,]
View(dds1_first20)
write.csv(dds1_first20, "dds1_first20.csv", row.names = F)# Subset normalized counts to significant genes
sig1_norm_counts <- norm_counts[dds1_sig$ensgene,]
# Choose a color palette from RColorBrewerlibrary(RColorBrewer)
heat_colors <- brewer.pal(6,"YlOrRd")
display.brewer.all()
# Run pheatmap
pheatmap(sig1_norm_counts[1:30,],
color = heat_colors,
cluster_rows =F,
show_rownames =T,
annotation = dplyr::select(colData, con),
scale ="row")
???????????????????????????????????????? #### Rlog
Transformation for clustering and heatmaps
rld<- rlogTransformation(dds)
head(assay(rld)) NHEM76 NHEM77 Sbcl2_64 Sbcl2_70 WM1366_66
ENSG00000223972 0.59199618 -0.2475137 0.5982058 0.2624646 1.238528
ENSG00000227232 2.65519006 2.5390191 2.9659902 2.8402982 2.883771
ENSG00000241860 0.07606271 0.6071246 0.8101048 0.7889924 1.150871
ENSG00000279457 2.04768637 2.3607590 3.3946150 2.9942330 2.018953
ENSG00000228463 2.76019786 2.9639344 1.1767126 1.6998418 3.235212
ENSG00000237094 2.05828030 1.5864318 2.4208884 2.3893842 3.215220
WM1366_72 WM3211_65 WM3211_71 WM793_67 WM793_73
ENSG00000223972 1.142079 0.4705563 -0.1237509 0.3304930 0.3015933
ENSG00000227232 2.883422 3.0371637 2.4531576 2.3373055 2.4466286
ENSG00000241860 1.332886 0.6969742 0.6241054 0.8766034 0.8394052
ENSG00000279457 1.577442 1.9135689 2.5849665 2.3133343 2.0960224
ENSG00000228463 2.019581 2.7474891 1.6624998 2.3398594 1.5198851
ENSG00000237094 3.791069 2.9359794 3.3425277 1.6105970 2.2993169
WM9_69 WM9_75 WM1158_68 WM1158_74
ENSG00000223972 -0.2332217 1.2210628 0.4700582 0.7969234
ENSG00000227232 2.5157238 3.2636258 2.9828250 2.6826526
ENSG00000241860 1.0332529 0.3052228 1.6796610 0.6579017
ENSG00000279457 2.6768982 2.3263161 2.2865932 2.1127464
ENSG00000228463 2.1095090 2.5062586 1.3374956 0.8738203
ENSG00000237094 3.1422104 3.4776052 3.4220916 3.7207741
hist(assay(rld))
sampleDists <- as.matrix(dist(t(assay(rld))))
plotPCA(rld, intgroup="con")plotPCA(vsd, intgroup = 'con')
we (select) the genes for the heatmap in (order) and this order is selected by (rowMeans) and select the (count) of my deseq data set (normalized) to library size and we want (decreasing) to have info about all upregulated and down regulated genes and set the number to present or to visualize [1:50] then we have to transform my normalized count and we select log2.norm.counts which i will use in the heatmap and then tell the heatmap that the annotation_col is the df (annotation_col=df)
#rowmeans is wrong ????
select1 <- order(rowMeans(counts(dds,normalized=TRUE)),decreasing=TRUE) [1:50]
nt <- normTransform(dds) # defaults to log2(x+1)
log2.norm.counts <- assay(nt)[select1,]
df <- as.data.frame(colData(dds)[,c("group","con")])
pheatmap(log2.norm.counts, cluster_rows=FALSE, show_rownames=TRUE,
cluster_cols=FALSE, annotation_col=df, fontsize_row = 5)
From the heatmap we can Know the expression level of genes: Dark blue indicates very low expressed genes light blue indicates low expressed genes Red indicates very highly expressed genes
df <- as.data.frame(colData(rld)[,c("con","group")])
pheatmap(assay(rld)[select1,], cluster_rows=T, show_rownames=TRUE,
cluster_cols=T, annotation_col=df, fontsize_row =8)top1_20 <- data.frame(sig1_norm_counts)[1:20,] %>% rownames_to_column(var ="ensgene")
#key column will be a seperate column
top1_20 <- gather(top1_20, key ="samplename", value ="normalized_counts",2:4)#head(sig1_norm_counts)
top1_20 <- inner_join(top1_20, rownames_to_column(colData, var ="samplename"), by ="samplename")ggplot(top1_20)+
geom_point(aes(x = ensgene, y = normalized_counts, color = con))+ scale_y_log10()+
xlab("Genes")+
ylab("Normalized Counts")+
ggtitle("Top 20 Significant DE Genes")+
theme_bw()+
theme(axis.text.x = element_text(angle =45, hjust =1))+
theme(plot.title = element_text(hjust =0.5))
#good idea is to make symbol as rownames
#top1_20_symbol <- data.frame(sig1_norm_counts)[1:20,] %>% rownames_to_column(var ="symbol")
#top1_20_symbol <- gather(top1_20, key ="samplename", value ="normalized_counts",2:4)#top1_20_symbol <- data.frame(sig1_norm_symbol)[1:60,] %>% rownames_to_column(var ="symbol")
#top1_20_symbol <- gather(top1_20_symbol, key ="samplename", value ="normalized_counts",2:4)#top1_20_symbol <- inner_join(top1_20_symbol, rownames_to_column(colData, var ="samplename"), by ="samplename")#head(top1_20_smybol)# adding symbol to norm counts
#sig1_norm_symbol <- rownames_to_column(dds1_res_df, var = "ensgene")
#pick symbol from gr38
#sig1_norm_symbol <- left_join(x = sig1_norm_symbol,
# y = grch38[ , c("ensgene", "symbol")],
# by = c("ensgene"))
#
#remove duplicates
#sig1_norm_symbol <- unique(sig1_norm_symbol)
#Order by p-adjusted Value
#sig1_norm_symbol <- sig1_norm_symbol %>% arrange(padj)
#get 1st 50
#sig1_norm_symbol <- sig1_norm_symbol[1:70,]
#change symbol to be rownames insted of ensembl ids
#row.names(sig1_norm_symbol) <- sig1_norm_symbol$symbol
#remove last column
#sig1_norm_symbol <- sig1_norm_symbol[,-7]
#another try
## Subset normalized counts to significant genes
sig1_norm_symbol <- norm_counts[dds1_sig$ensgene,]
#apply this to norm counts
sig1_norm_symbol <- rownames_to_column(as.data.frame(sig1_norm_symbol), var = "ensgene")
#pick symbol from gr38
sig1_norm_symbol <- left_join(x = sig1_norm_symbol,
y = grch38[ , c("ensgene", "symbol")],
by = c("ensgene"))
#remove duplicates
sig1_norm_symbol <- unique(sig1_norm_symbol)top1_20_symbol <- data.frame(sig1_norm_symbol)[1:20,] %>% rownames_to_column(var ="ids")
#remove ids column
top1_20_symbol <- top1_20_symbol[,-1]
#key column will be a seperate column
top1_20_symbol <- gather(top1_20_symbol, key ="samplename", value ="normalized_counts",2:4)
top1_20_symbol <- inner_join(top1_20_symbol, rownames_to_column(colData, var ="samplename"), by ="samplename")ggplot(top1_20_symbol)+
geom_point(aes(x = symbol, y = normalized_counts, color = con))+ scale_y_log10()+
xlab("Genes")+
ylab("Normalized Counts")+
ggtitle("Top 20 Significant DE Genes")+
theme_bw()+
theme(axis.text.x = element_text(angle =45, hjust =1))+
theme(plot.title = element_text(hjust =0.5))
# Run pheatmap
pheatmap(sig1_norm_counts[1:30,],
color = heat_colors,
cluster_rows =F,
show_rownames =T,
annotation = dplyr::select(colData, con),
scale ="row")head(sig1_norm_counts) NHEM76 NHEM77 Sbcl2_64 Sbcl2_70 WM1366_66
ENSG00000136040 4982.02798 4788.983166 49.38698 53.27192 25.3786
ENSG00000123374 4500.69744 4672.714822 556.25340 526.89255 885.0788
ENSG00000008256 1040.59893 1029.061608 81.44520 95.72298 762.9443
ENSG00000119042 20.55504 11.568989 604.77394 598.47669 453.6426
ENSG00000138771 4.28230 8.098293 997.27050 1432.51513 68.2050
ENSG00000135047 9487.00772 8739.793049 280.72601 357.08832 607.5003
WM1366_72 WM3211_65 WM3211_71 WM793_67 WM793_73
ENSG00000136040 38.58537 33.78052 35.52636 37.26932 32.04139
ENSG00000123374 948.88514 867.48375 877.84480 844.44438 905.39826
ENSG00000008256 690.59939 906.66916 694.48296 823.84818 789.13377
ENSG00000119042 572.48091 408.06868 500.80702 488.42428 496.18388
ENSG00000138771 92.91987 48.64395 68.76069 58.84630 69.57560
ENSG00000135047 724.46002 664.80063 755.22157 1059.23337 632.58868
WM9_69 WM9_75 WM1158_68 WM1158_74
ENSG00000136040 49.8626 49.79399 65.05817 68.08078
ENSG00000123374 724.5856 689.64674 693.95380 852.86652
ENSG00000008256 340.8329 338.59912 267.68726 299.55544
ENSG00000119042 217.1232 243.99054 321.22471 257.46914
ENSG00000138771 134.4397 201.66565 168.06694 196.81535
ENSG00000135047 1449.8023 1175.13812 828.13628 1067.01152
head(sig1_norm_symbol)ensgene <chr> | NHEM76 <dbl> | NHEM77 <dbl> | Sbcl2_64 <dbl> | Sbcl2_70 <dbl> | WM1366_66 <dbl> | WM1366_72 <dbl> | WM3211_65 <dbl> | ||
|---|---|---|---|---|---|---|---|---|---|
| 1 | ENSG00000136040 | 4982.02798 | 4788.983166 | 49.38698 | 53.27192 | 25.3786 | 38.58537 | 33.78052 | |
| 2 | ENSG00000123374 | 4500.69744 | 4672.714822 | 556.25340 | 526.89255 | 885.0788 | 948.88514 | 867.48375 | |
| 3 | ENSG00000008256 | 1040.59893 | 1029.061608 | 81.44520 | 95.72298 | 762.9443 | 690.59939 | 906.66916 | |
| 4 | ENSG00000119042 | 20.55504 | 11.568989 | 604.77394 | 598.47669 | 453.6426 | 572.48091 | 408.06868 | |
| 5 | ENSG00000138771 | 4.28230 | 8.098293 | 997.27050 | 1432.51513 | 68.2050 | 92.91987 | 48.64395 | |
| 6 | ENSG00000135047 | 9487.00772 | 8739.793049 | 280.72601 | 357.08832 | 607.5003 | 724.46002 | 664.80063 |
#remove duplicates
sig1_norm_symbol <- unique(sig1_norm_symbol)
#get 1st 50
sig1_norm_symbol <- sig1_norm_symbol[1:70,]
#change symbol to be rownames instead of ensembl ids
row.names(sig1_norm_symbol) <- sig1_norm_symbol$symbol
#remove first and last column
sig1_norm_symbol <- sig1_norm_symbol[,c(-1,-16)]# Run pheatmap
pheatmap(sig1_norm_symbol[1:30,],
color = heat_colors,
cluster_rows =F,
show_rownames =T,
annotation = dplyr::select(colData, con),
scale ="row")pheatmap(sig1_norm_symbol[1:30,],
#color = heat_colors,
cluster_rows =F,
show_rownames =T,
annotation = dplyr::select(colData, con),cluster_cols=T, fontsize_row = 7,
scale ="row")
# Run pheatmap
pheatmap(sig1_norm_symbol[1:30,1:4],
#color = heat_colors,
cluster_rows =F,
cluster_cols = F,
show_rownames =T,
annotation = dplyr::select(colData, con),
fontsize_row = 7,
scale ="row")Instead of using the revel() function to make multiple comparisons,
we will subset the data to make the desired comparison clear.
we will try to find the significant genes during the development of cancer cells from the Radial growth phase to vertical growth phase.
??????????????????????????????
resultsNames(dds)[1] "Intercept" "con_metastasis_vs_normal"
[3] "con_radial_vs_normal" "con_vertical_vs_normal"
dds$con <- relevel(dds$con, ref = "radial")#run the analysis again
dds2 <- DESeq(dds)using pre-existing size factors
estimating dispersions
found already estimated dispersions, replacing these
gene-wise dispersion estimates
mean-dispersion relationship
final dispersion estimates
fitting model and testing
resultsNames(dds2)[1] "Intercept" "con_normal_vs_radial"
[3] "con_metastasis_vs_radial" "con_vertical_vs_radial"
#mycols <- brewer.pal(11, "Set3")[1:length(unique(group1))]design(dds2)~con
head(counts(dds2)) #un-normalized NHEM76 NHEM77 Sbcl2_64 Sbcl2_70 WM1366_66 WM1366_72
ENSG00000223972 2 0 2 1 3 5
ENSG00000227232 7 9 10 9 5 10
ENSG00000241860 0 2 2 2 2 5
ENSG00000279457 4 9 19 13 2 2
ENSG00000228463 11 20 1 3 10 5
ENSG00000237094 3 2 5 5 7 25
WM3211_65 WM3211_71 WM793_67 WM793_73 WM9_69 WM9_75
ENSG00000223972 1 0 1 1 0 2
ENSG00000227232 7 4 4 5 8 5
ENSG00000241860 1 1 2 2 4 0
ENSG00000279457 2 6 5 4 12 2
ENSG00000228463 7 2 6 2 7 3
ENSG00000237094 6 11 1 4 16 6
WM1158_68 WM1158_74
ENSG00000223972 2 2
ENSG00000227232 13 5
ENSG00000241860 9 1
ENSG00000279457 7 3
ENSG00000228463 2 0
ENSG00000237094 20 15
plotDispEsts(dds2)
dds2_res <- results(dds2, contrast = c("con", "vertical", "radial"))
dds2_reslog2 fold change (MLE): con vertical vs radial
Wald test p-value: con vertical vs radial
DataFrame with 26574 rows and 6 columns
baseMean log2FoldChange lfcSE stat
<numeric> <numeric> <numeric> <numeric>
ENSG00000223972 1.78799 0.615814 1.563997 0.393744
ENSG00000227232 7.01446 -0.362312 0.606026 -0.597848
ENSG00000241860 1.98683 0.411896 1.202772 0.342455
ENSG00000279457 5.70124 -1.849810 0.658504 -2.809110
ENSG00000228463 5.49010 1.898228 1.143957 1.659353
... ... ... ... ...
ENSG00000198695 399.68416 -0.1841260 0.391999 -0.469710
ENSG00000210194 2.94970 -0.7581881 1.536414 -0.493479
ENSG00000198727 1411.29417 0.2099735 0.328341 0.639499
ENSG00000210195 9.21425 -1.2067161 0.712404 -1.693864
ENSG00000210196 58.66427 0.0646426 0.409127 0.158001
pvalue padj
<numeric> <numeric>
ENSG00000223972 0.69377001 0.8556240
ENSG00000227232 0.54994104 0.7675305
ENSG00000241860 0.73200836 0.8779981
ENSG00000279457 0.00496786 0.0361983
ENSG00000228463 0.09704464 0.2871696
... ... ...
ENSG00000198695 0.6385620 0.823365
ENSG00000210194 0.6216742 0.812410
ENSG00000198727 0.5224986 0.747588
ENSG00000210195 0.0902911 0.274519
ENSG00000210196 0.8744558 0.949233
plotMA(dds2, ylim=c(-20,20))The genes that are significantly DE are colored blue.
dds2_LFC <- lfcShrink(dds2,
coef = "con_vertical_vs_radial" ,
res = dds2_res)using 'apeglm' for LFC shrinkage. If used in published research, please cite:
Zhu, A., Ibrahim, J.G., Love, M.I. (2018) Heavy-tailed prior distributions for
sequence count data: removing the noise and preserving large differences.
Bioinformatics. https://doi.org/10.1093/bioinformatics/bty895
plotMA(dds2_LFC, ylim=c(-20,20))
mcols(dds2_LFC)DataFrame with 5 rows and 2 columns
type description
<character> <character>
baseMean intermediate mean of normalized c..
log2FoldChange results log2 fold change (MA..
lfcSE results posterior SD: con ve..
pvalue results Wald test p-value: c..
padj results BH adjusted p-values
We will use the adjusted p-value for the multiple test correction.
The reason for this is that for every gene tested with alpha 0.05, there is a 5% chance that the gene is called as a DE when it is not, yielding false positives.
Therefore, multiple test correction is performed by DESeq2 using Benjamin-Hochberg, or BH-method, to adjust the p-values for multiple testing and control the proportion of false postives relative to true.
Using BH-method and alpha value of 0.05, if we had 1000 genes identified as DE, we would expect 5% of the DE genes to be false positives, or 50 genes. to reduce the number of genes tested, DESeq2 automatically filters out genes unlikely to be truly differential expressed prior to testing, such as genes with zero counts across all samples, genes with low mean values across all samples, and genes with extreme count outlines.
head(dds2_LFC,n =10 )log2 fold change (MAP): con vertical vs radial
Wald test p-value: con vertical vs radial
DataFrame with 10 rows and 5 columns
baseMean log2FoldChange lfcSE pvalue
<numeric> <numeric> <numeric> <numeric>
ENSG00000223972 1.78799 0.0870122 0.599502 0.693770012
ENSG00000227232 7.01446 -0.1934853 0.452214 0.549941038
ENSG00000241860 1.98683 0.0919239 0.570040 0.732008359
ENSG00000279457 5.70124 -1.4218850 0.683367 0.004967865
ENSG00000228463 5.49010 0.5511408 0.817487 0.097044645
ENSG00000237094 8.96601 0.4654540 0.647805 0.179601852
ENSG00000225972 5.58042 -0.3012066 0.548882 0.366442044
ENSG00000225630 38.66330 -0.0128316 0.454937 0.968476613
ENSG00000237973 141.47226 0.0609502 0.430149 0.852287098
ENSG00000229344 94.37507 2.0547940 0.933471 0.000812679
padj
<numeric>
ENSG00000223972 0.85562404
ENSG00000227232 0.76753046
ENSG00000241860 0.87799812
ENSG00000279457 0.03619831
ENSG00000228463 0.28716961
ENSG00000237094 0.41486071
ENSG00000225972 0.62133417
ENSG00000225630 0.99138372
ENSG00000237973 0.93853307
ENSG00000229344 0.00892282
We can see the filtered genes in the results table represented by an NA in the p-adjusted column.
summary(dds2_LFC)
out of 26574 with nonzero total read count
adjusted p-value < 0.1
LFC > 0 (up) : 2421, 9.1%
LFC < 0 (down) : 1902, 7.2%
outliers [1] : 7, 0.026%
low counts [2] : 5152, 19%
(mean count < 2)
[1] see 'cooksCutoff' argument of ?results
[2] see 'independentFiltering' argument of ?results
Over 4000 genes are DE which is the sum of log fold changes less than zero and greater than zero.
summary(dds2_LFC, alpha = 0.05)
out of 26574 with nonzero total read count
adjusted p-value < 0.05
LFC > 0 (up) : 1903, 7.2%
LFC < 0 (down) : 1388, 5.2%
outliers [1] : 7, 0.026%
low counts [2] : 5152, 19%
(mean count < 2)
[1] see 'cooksCutoff' argument of ?results
[2] see 'independentFiltering' argument of ?results
#Rerun results function, Using 1.25 foldchange threshold which is 0.32 in the log2 scale
#remove lfcThreshold use lfc = 1??????
dds2_res <- results(dds2,
contrast = c("con", "vertical", "radial"),
alpha = 0.05,
#lfcThreshold = 0.32
) #Reshrink the foldchanges
dds2_LFC <- lfcShrink(dds2,
coef = "con_vertical_vs_radial" ,
res = dds2_res)using 'apeglm' for LFC shrinkage. If used in published research, please cite:
Zhu, A., Ibrahim, J.G., Love, M.I. (2018) Heavy-tailed prior distributions for
sequence count data: removing the noise and preserving large differences.
Bioinformatics. https://doi.org/10.1093/bioinformatics/bty895
We will use the “annotables” library to annotate the Ensembl genes.
# Turn results table into a data frame
dds2_res_df <- data.frame(dds2_LFC)
head (dds2_res_df)baseMean <dbl> | log2FoldChange <dbl> | lfcSE <dbl> | pvalue <dbl> | padj <dbl> | |
|---|---|---|---|---|---|
| ENSG00000223972 | 1.787989 | 0.08701221 | 0.5995023 | 0.693770012 | NA |
| ENSG00000227232 | 7.014458 | -0.19348532 | 0.4522144 | 0.549941038 | 0.76222194 |
| ENSG00000241860 | 1.986826 | 0.09192389 | 0.5700397 | 0.732008359 | NA |
| ENSG00000279457 | 5.701238 | -1.42188500 | 0.6833669 | 0.004967865 | 0.03459684 |
| ENSG00000228463 | 5.490095 | 0.55114076 | 0.8174869 | 0.097044645 | 0.27920595 |
| ENSG00000237094 | 8.966015 | 0.46545397 | 0.6478051 | 0.179601852 | 0.40664269 |
head(grcm38)ensgene <chr> | entrez <int> | symbol <chr> | chr <chr> | start <int> | end <int> | strand <int> | biotype <chr> | |
|---|---|---|---|---|---|---|---|---|
| ENSMUSG00000000001 | 14679 | Gnai3 | 3 | 108014596 | 108053462 | -1 | protein_coding | |
| ENSMUSG00000000003 | 54192 | Pbsn | X | 76881507 | 76897229 | -1 | protein_coding | |
| ENSMUSG00000000028 | 12544 | Cdc45 | 16 | 18599197 | 18630737 | -1 | protein_coding | |
| ENSMUSG00000000031 | NA | H19 | 7 | 142129266 | 142131880 | -1 | lncRNA | |
| ENSMUSG00000000037 | 107815 | Scml2 | X | 159865521 | 160041209 | 1 | protein_coding | |
| ENSMUSG00000000049 | 11818 | Apoh | 11 | 108234180 | 108305222 | 1 | protein_coding |
dds2_res_anno <- rownames_to_column(dds2_res_df, var = "ensgene")
dds2_res_anno <- left_join(x = dds2_res_anno,
y = grch38[, c("ensgene", "symbol", "description")],
by = c("ensgene"))#View(dds2_res_anno)dds2_sig <- subset(dds2_res_anno, padj < 0.05)
#### ordering the genes by p-adjusted values
dds2_sig <- dds2_sig %>% arrange(padj)
head(dds2_sig)ensgene <chr> | baseMean <dbl> | log2FoldChange <dbl> | lfcSE <dbl> | pvalue <dbl> | padj <dbl> | symbol <chr> | ||
|---|---|---|---|---|---|---|---|---|
| 1 | ENSG00000164741 | 938.53714 | 4.329980 | 0.2278739 | 5.834329e-82 | 1.189270e-77 | DLC1 | |
| 2 | ENSG00000008256 | 582.94152 | 3.112443 | 0.1853774 | 1.561472e-64 | 1.591452e-60 | CYTH3 | |
| 3 | ENSG00000124575 | 1932.32137 | -10.011641 | 0.6741329 | 5.887644e-50 | 4.000458e-46 | H1-3 | |
| 4 | ENSG00000138771 | 253.57894 | -4.116167 | 0.2819125 | 6.602015e-49 | 3.364387e-45 | SHROOM3 | |
| 5 | ENSG00000162576 | 73.99471 | -6.881397 | 0.4940942 | 9.545988e-45 | 3.891708e-41 | MXRA8 | |
| 6 | ENSG00000206538 | 447.01936 | 4.679596 | 0.3411158 | 2.540234e-44 | 8.630023e-41 | VGLL3 |
head(dds2_sig)ensgene <chr> | baseMean <dbl> | log2FoldChange <dbl> | lfcSE <dbl> | pvalue <dbl> | padj <dbl> | symbol <chr> | ||
|---|---|---|---|---|---|---|---|---|
| 1 | ENSG00000164741 | 938.53714 | 4.329980 | 0.2278739 | 5.834329e-82 | 1.189270e-77 | DLC1 | |
| 2 | ENSG00000008256 | 582.94152 | 3.112443 | 0.1853774 | 1.561472e-64 | 1.591452e-60 | CYTH3 | |
| 3 | ENSG00000124575 | 1932.32137 | -10.011641 | 0.6741329 | 5.887644e-50 | 4.000458e-46 | H1-3 | |
| 4 | ENSG00000138771 | 253.57894 | -4.116167 | 0.2819125 | 6.602015e-49 | 3.364387e-45 | SHROOM3 | |
| 5 | ENSG00000162576 | 73.99471 | -6.881397 | 0.4940942 | 9.545988e-45 | 3.891708e-41 | MXRA8 | |
| 6 | ENSG00000206538 | 447.01936 | 4.679596 | 0.3411158 | 2.540234e-44 | 8.630023e-41 | VGLL3 |
dds2_first20 <- dds2_sig[1:20,]
#View(dds2_first20)
write.csv(dds2_first20, "dds2_first20.csv", row.names = F)#View(dds2_sig)# Subset normalized counts to significant genes
sig_norm_counts2 <- norm_counts[dds2_sig$ensgene,]
# Choose a color palette from RColorBrewerlibrary(RColorBrewer)
heat_colors <- brewer.pal(6,"YlOrRd")
display.brewer.all()
# Run pheatmap
pheatmap(sig_norm_counts2,
color = heat_colors,
cluster_rows =T,
show_rownames =F,
annotation = dplyr::select(colData, con),
scale ="row")
top2_20 <- data.frame(sig_norm_counts2)[1:20,] %>% rownames_to_column(var ="symbol")
top2_20 <- gather(top2_20, key ="samplename", value ="normalized_counts",4:9)top2_20 <- inner_join(top2_20, rownames_to_column(colData, var ="samplename"), by ="samplename")ggplot(top2_20)+
geom_point(aes(x = symbol, y = normalized_counts, color = con))+ scale_y_log10()+
xlab("Genes")+
ylab("Normalized Counts")+
ggtitle("Top 20 Significant DE Genes")+
theme_bw()+
theme(axis.text.x = element_text(angle =45, hjust =1))+
theme(plot.title = element_text(hjust =0.5))Warning: Transformation introduced infinite values in continuous y-axis
## Subset normalized counts to significant genes
sig2_norm_symbol <- norm_counts[dds2_sig$ensgene,]
#apply this to norm counts
sig2_norm_symbol <- rownames_to_column(as.data.frame(sig2_norm_symbol), var = "ensgene")
#pick symbol from gr38
sig2_norm_symbol <- left_join(x = sig2_norm_symbol,
y = grch38[ , c("ensgene", "symbol")],
by = c("ensgene"))
#remove duplicates
sig2_norm_symbol <- unique(sig2_norm_symbol)head(sig2_norm_symbol)ensgene <chr> | NHEM76 <dbl> | NHEM77 <dbl> | Sbcl2_64 <dbl> | Sbcl2_70 <dbl> | WM1366_66 <dbl> | WM1366_72 <dbl> | ||
|---|---|---|---|---|---|---|---|---|
| 1 | ENSG00000164741 | 276.63659 | 257.9884638 | 61.51712 | 80.74025 | 1568.714978 | 1730.829490 | |
| 2 | ENSG00000008256 | 1040.59893 | 1029.0616080 | 81.44520 | 95.72298 | 762.944292 | 690.599392 | |
| 3 | ENSG00000124575 | 3493.50045 | 3077.3511829 | 3734.34912 | 3652.45578 | 4.758488 | 3.149826 | |
| 4 | ENSG00000138771 | 4.28230 | 8.0982926 | 997.27050 | 1432.51513 | 68.204999 | 92.919873 | |
| 5 | ENSG00000162576 | 2.56938 | 1.1568989 | 432.35272 | 542.70765 | 3.172326 | 3.937283 | |
| 6 | ENSG00000206538 | 3.42584 | 0.5784495 | 36.39041 | 39.12156 | 1134.106379 | 1306.390411 |
head(top2_20_symbol)ensgene <chr> | NHEM76 <dbl> | NHEM77 <dbl> | WM793_67 <dbl> | WM793_73 <dbl> | WM9_69 <dbl> | WM9_75 <dbl> | ||
|---|---|---|---|---|---|---|---|---|
| 1 | ENSG00000164741 | 276.63659 | 257.9884638 | 1080.810348 | 1384.188246 | 791.489826 | 963.513674 | |
| 2 | ENSG00000008256 | 1040.59893 | 1029.0616080 | 823.848178 | 789.133775 | 340.832940 | 338.599120 | |
| 3 | ENSG00000124575 | 3493.50045 | 3077.3511829 | 4.903858 | 4.577342 | 1789.372933 | 1224.932112 | |
| 4 | ENSG00000138771 | 4.28230 | 8.0982926 | 58.846298 | 69.575600 | 134.439660 | 201.665653 | |
| 5 | ENSG00000162576 | 2.56938 | 1.1568989 | 1.961543 | 4.577342 | 4.418205 | 4.979399 | |
| 6 | ENSG00000206538 | 3.42584 | 0.5784495 | 940.560004 | 960.326369 | 32.189778 | 39.835191 |
top2_20_symbol <- data.frame(sig2_norm_symbol)[1:20,] %>% rownames_to_column(var ="ids")
#remove ids column
top2_20_symbol <- top2_20_symbol[,-1]
#key column will be a seperate column
top2_20_symbol <- gather(top2_20_symbol, key ="samplename", value ="normalized_counts",4:9)
top2_20_symbol <- inner_join(top2_20_symbol, rownames_to_column(colData, var ="samplename"), by ="samplename")ggplot(top2_20_symbol)+
geom_point(aes(x = symbol, y = normalized_counts, color = con))+ scale_y_log10()+
xlab("Genes")+
ylab("Normalized Counts")+
ggtitle("Top 20 Significant DE Genes")+
theme_bw()+
theme(axis.text.x = element_text(angle =45, hjust =1))+
theme(plot.title = element_text(hjust =0.5))Warning: Transformation introduced infinite values in continuous y-axis
#remove duplicates
sig2_norm_symbol <- unique(sig2_norm_symbol)
#get 1st 50
sig2_norm_symbol <- sig2_norm_symbol[1:70,]
#change symbol to be rownames instead of ensembl ids
row.names(sig2_norm_symbol) <- sig2_norm_symbol$symbol
#remove first and last column
sig2_norm_symbol <- sig2_norm_symbol[,c(-1,-16)]# Run pheatmap
pheatmap(sig2_norm_symbol[1:30,],
color = heat_colors,
cluster_rows =F,
show_rownames =T,
annotation = dplyr::select(colData, con),
scale ="row")pheatmap(sig2_norm_symbol[1:30,],
#color = heat_colors,
cluster_rows =F,
show_rownames =T,
annotation = dplyr::select(colData, con),cluster_cols=T, fontsize_row = 7,
scale ="row")
# Run pheatmap
pheatmap(sig2_norm_symbol[1:30,3:10],
#color = heat_colors,
cluster_rows =F,
cluster_cols = F,
show_rownames =T,
annotation = dplyr::select(colData, con),
fontsize_row = 7,
scale ="row")Now, we will move to the 3nd comparison and perform the analysis again. we will try to find the significant genes during the development from the Vertical growth phase to the Metastasis.
head(counts_data)NHEM76 <int> | NHEM77 <int> | Sbcl2_64 <int> | Sbcl2_70 <int> | WM1366_66 <int> | WM1366_72 <int> | WM3211_65 <int> | WM3211_71 <int> | WM793_67 <int> | ||
|---|---|---|---|---|---|---|---|---|---|---|
| ENSG00000223972 | 2 | 0 | 2 | 1 | 3 | 5 | 1 | 0 | 1 | |
| ENSG00000227232 | 7 | 9 | 10 | 9 | 5 | 10 | 7 | 4 | 4 | |
| ENSG00000241860 | 0 | 2 | 2 | 2 | 2 | 5 | 1 | 1 | 2 | |
| ENSG00000279457 | 4 | 9 | 19 | 13 | 2 | 2 | 2 | 6 | 5 | |
| ENSG00000228463 | 11 | 20 | 1 | 3 | 10 | 5 | 7 | 2 | 6 | |
| ENSG00000237094 | 3 | 2 | 5 | 5 | 7 | 25 | 6 | 11 | 1 |
resultsNames(dds)[1] "Intercept" "con_metastasis_vs_normal"
[3] "con_radial_vs_normal" "con_vertical_vs_normal"
dds$con <- relevel(dds$con, ref = "vertical")#run the analysis again
dds3 <- DESeq(dds)using pre-existing size factors
estimating dispersions
found already estimated dispersions, replacing these
gene-wise dispersion estimates
mean-dispersion relationship
final dispersion estimates
fitting model and testing
resultsNames(dds3)[1] "Intercept" "con_radial_vs_vertical"
[3] "con_normal_vs_vertical" "con_metastasis_vs_vertical"
plotDispEsts(dds3)dds3_res <- results(dds3, contrast = c("con", "metastasis", "vertical"))
dds3_reslog2 fold change (MLE): con metastasis vs vertical
Wald test p-value: con metastasis vs vertical
DataFrame with 26574 rows and 6 columns
baseMean log2FoldChange lfcSE stat
<numeric> <numeric> <numeric> <numeric>
ENSG00000223972 1.78799 -0.0755989 1.217621 -0.0620874
ENSG00000227232 7.01446 0.2697658 0.506884 0.5322040
ENSG00000241860 1.98683 0.3424275 0.916919 0.3734545
ENSG00000279457 5.70124 0.5223273 0.608146 0.8588846
ENSG00000228463 5.49010 -1.0887743 0.840198 -1.2958546
... ... ... ... ...
ENSG00000198695 399.68416 1.376354 0.309246 4.45068
ENSG00000210194 2.94970 1.593152 1.220674 1.30514
ENSG00000198727 1411.29417 0.928511 0.259295 3.58091
ENSG00000210195 9.21425 1.086471 0.596912 1.82015
ENSG00000210196 58.66427 1.129573 0.318242 3.54941
pvalue padj
<numeric> <numeric>
ENSG00000223972 0.950493 0.979303
ENSG00000227232 0.594585 0.785553
ENSG00000241860 0.708810 0.858696
ENSG00000279457 0.390404 0.624851
ENSG00000228463 0.195026 0.413021
... ... ...
ENSG00000198695 8.55982e-06 0.000182397
ENSG00000210194 1.91845e-01 0.408995282
ENSG00000198727 3.42404e-04 0.003847627
ENSG00000210195 6.87358e-02 0.211005978
ENSG00000210196 3.86092e-04 0.004182180
log2 fold change : condition MET vs VGP, indicating that VGP is the base level of comparison.
plotMA(dds3, ylim=c(-20,20))The genes that are significantly DE are colored blue.
dds3_LFC <- lfcShrink(dds3,
coef = "con_metastasis_vs_vertical" ,
res = dds3_res)using 'apeglm' for LFC shrinkage. If used in published research, please cite:
Zhu, A., Ibrahim, J.G., Love, M.I. (2018) Heavy-tailed prior distributions for
sequence count data: removing the noise and preserving large differences.
Bioinformatics. https://doi.org/10.1093/bioinformatics/bty895
plotMA(dds3_LFC, ylim=c(-20,20))Now we see more restricted log2fold change values, especially for lowly expressed genes.
mcols(dds3_LFC)DataFrame with 5 rows and 2 columns
type description
<character> <character>
baseMean intermediate mean of normalized c..
log2FoldChange results log2 fold change (MA..
lfcSE results posterior SD: con me..
pvalue results Wald test p-value: c..
padj results BH adjusted p-values
head(dds3_LFC,n =10 )log2 fold change (MAP): con metastasis vs vertical
Wald test p-value: con metastasis vs vertical
DataFrame with 10 rows and 5 columns
baseMean log2FoldChange lfcSE pvalue
<numeric> <numeric> <numeric> <numeric>
ENSG00000223972 1.78799 -0.0114374 0.504345 9.50493e-01
ENSG00000227232 7.01446 0.1474614 0.383395 5.94585e-01
ENSG00000241860 1.98683 0.0959220 0.476030 7.08810e-01
ENSG00000279457 5.70124 0.2493869 0.439049 3.90404e-01
ENSG00000228463 5.49010 -0.3594793 0.570045 1.95026e-01
ENSG00000237094 8.96601 0.2806600 0.451384 3.40359e-01
ENSG00000225972 5.58042 0.8239505 0.656229 3.94584e-02
ENSG00000225630 38.66330 1.7205444 0.516495 8.36129e-05
ENSG00000237973 141.47226 0.4001346 0.393890 1.77995e-01
ENSG00000229344 94.37507 -1.4352689 0.683092 2.22839e-03
padj
<numeric>
ENSG00000223972 0.97930298
ENSG00000227232 0.78555316
ENSG00000241860 0.85869610
ENSG00000279457 0.62485098
ENSG00000228463 0.41302143
ENSG00000237094 0.57723834
ENSG00000225972 0.14324477
ENSG00000225630 0.00123233
ENSG00000237973 0.39120854
ENSG00000229344 0.01687426
summary(dds3_LFC)
out of 26574 with nonzero total read count
adjusted p-value < 0.1
LFC > 0 (up) : 2822, 11%
LFC < 0 (down) : 2283, 8.6%
outliers [1] : 7, 0.026%
low counts [2] : 5152, 19%
(mean count < 2)
[1] see 'cooksCutoff' argument of ?results
[2] see 'independentFiltering' argument of ?results
Over 5000 genes are DE.
<br
summary(dds3_LFC, alpha = 0.05)
out of 26574 with nonzero total read count
adjusted p-value < 0.05
LFC > 0 (up) : 2193, 8.3%
LFC < 0 (down) : 1759, 6.6%
outliers [1] : 7, 0.026%
low counts [2] : 5152, 19%
(mean count < 2)
[1] see 'cooksCutoff' argument of ?results
[2] see 'independentFiltering' argument of ?results
#Rerun results function, Using 1.25 foldchange threshold which is 0.32 in the log2 scale
dds3_res <- results(dds3,
contrast = c("con", "metastasis", "vertical"),
alpha = 0.05,
# lfcThreshold = 0.32
) #Reshrink the foldchanges
dds3_LFC <- lfcShrink(dds3,
coef = "con_metastasis_vs_vertical" ,
res = dds3_res)using 'apeglm' for LFC shrinkage. If used in published research, please cite:
Zhu, A., Ibrahim, J.G., Love, M.I. (2018) Heavy-tailed prior distributions for
sequence count data: removing the noise and preserving large differences.
Bioinformatics. https://doi.org/10.1093/bioinformatics/bty895
# Turn results table into a data frame
dds3_res_df <- data.frame(dds3_LFC)
head (dds3_res_df)baseMean <dbl> | log2FoldChange <dbl> | lfcSE <dbl> | pvalue <dbl> | padj <dbl> | |
|---|---|---|---|---|---|
| ENSG00000223972 | 1.787989 | -0.01143740 | 0.5043450 | 0.9504932 | NA |
| ENSG00000227232 | 7.014458 | 0.14746142 | 0.3833946 | 0.5945847 | 0.7780712 |
| ENSG00000241860 | 1.986826 | 0.09592199 | 0.4760299 | 0.7088102 | NA |
| ENSG00000279457 | 5.701238 | 0.24938689 | 0.4390487 | 0.3904042 | 0.6150873 |
| ENSG00000228463 | 5.490095 | -0.35947934 | 0.5700447 | 0.1950256 | 0.4031438 |
| ENSG00000237094 | 8.966015 | 0.28065998 | 0.4513836 | 0.3403590 | 0.5670516 |
dds3_res_anno <- rownames_to_column(dds3_res_df, var = "ensgene")
dds3_res_anno <- left_join(x = dds3_res_anno,
y = grch38[, c("ensgene", "symbol", "description")],
by = c("ensgene"))View(dds3_res_anno)dds3_sig <- subset(dds3_res_anno, padj < 0.05)dds3_sig <- dds3_sig %>% arrange(padj)#View(dds3_sig)dds3_first20 <- dds3_sig[1:20,]
write.csv(dds3_first20, "dds3_first20.csv", row.names = F)# Subset normalized counts to significant genes
sig_norm_counts3 <- norm_counts[dds3_sig$ensgene,]
# Choose a color palette from RColorBrewerlibrary(RColorBrewer)
heat_colors <- brewer.pal(6,"YlOrRd")
display.brewer.all()
# Run pheatmap
pheatmap(sig_norm_counts3,
color = heat_colors,
cluster_rows =T,
show_rownames =F,
annotation = dplyr::select(colData, con),
scale ="row")##error NA/NAN vlauessig_norm_counts2[sig_norm_counts2==0] <- NA
# Delete the rows associated with NA.
sig_norm_counts2<-sig_norm_counts2[complete.cases(sig_norm_counts2),]# Run pheatmap
#pheatmap(sig_norm_counts2,
# color = heat_colors,
# cluster_rows =T,
# show_rownames =F,
#annotation = select(colData2, con2),
#scale ="row")top3_20 <- data.frame(sig_norm_counts3)[1:20,] %>% rownames_to_column(var ="ensgene")
top3_20 <- gather(top3_20, key ="samplename", value ="normalized_counts",9:14)top3_20 <- inner_join(top3_20, rownames_to_column(colData, var ="samplename"), by ="samplename")ggplot(top3_20)+
geom_point(aes(x = ensgene, y = normalized_counts, color = con))+ scale_y_log10()+
xlab("Genes")+
ylab("Normalized Counts")+
ggtitle("Top 20 Significant DE Genes")+
theme_bw()+
theme(axis.text.x = element_text(angle =45, hjust =1))+
theme(plot.title = element_text(hjust =0.5))Warning: Transformation introduced infinite values in continuous y-axis
## Subset normalized counts to significant genes
sig3_norm_symbol <- norm_counts[dds3_sig$ensgene,]
#apply this to norm counts
sig3_norm_symbol <- rownames_to_column(as.data.frame(sig3_norm_symbol), var = "ensgene")
#pick symbol from gr38
sig3_norm_symbol <- left_join(x = sig3_norm_symbol,
y = grch38[ , c("ensgene", "symbol")],
by = c("ensgene"))
#remove duplicates
sig3_norm_symbol <- unique(sig3_norm_symbol)head(sig3_norm_symbol)ensgene <chr> | NHEM76 <dbl> | NHEM77 <dbl> | Sbcl2_64 <dbl> | Sbcl2_70 <dbl> | WM1366_66 <dbl> | WM1366_72 <dbl> | ||
|---|---|---|---|---|---|---|---|---|
| 1 | ENSG00000124575 | 3493.50045 | 3077.3511829 | 3734.349125 | 3652.4557809 | 4.758488 | 3.1498262 | |
| 2 | ENSG00000206538 | 3.42584 | 0.5784495 | 36.390409 | 39.1215637 | 1134.106379 | 1306.3904114 | |
| 3 | ENSG00000164692 | 287.77057 | 241.2134292 | 2.599315 | 2.4971211 | 1.586163 | 0.7874565 | |
| 4 | ENSG00000148154 | 1949.30302 | 1786.8304143 | 1711.215666 | 2150.0212585 | 5538.880388 | 5631.1017674 | |
| 5 | ENSG00000124092 | 0.85646 | 2.8922474 | 0.000000 | 0.8323737 | 0.000000 | 1.5749131 | |
| 6 | ENSG00000111907 | 8.56460 | 12.1474389 | 436.684909 | 485.2738652 | 31.723255 | 33.8606315 |
NAtop3_20_symbol <- data.frame(sig3_norm_symbol)[1:20,] %>% rownames_to_column(var ="ids")
#remove ids column
top3_20_symbol <- top3_20_symbol[,-1]
#key column will be a seperate column
top3_20_symbol <- gather(top3_20_symbol, key ="samplename", value ="normalized_counts",10:14)
top3_20_symbol <- inner_join(top3_20_symbol, rownames_to_column(colData, var ="samplename"), by ="samplename")ggplot(top3_20_symbol)+
geom_point(aes(x = symbol, y = normalized_counts, color = con))+ scale_y_log10()+
xlab("Genes")+
ylab("Normalized Counts")+
ggtitle("Top 20 Significant DE Genes")+
theme_bw()+
theme(axis.text.x = element_text(angle =45, hjust =1))+
theme(plot.title = element_text(hjust =0.5))Warning: Transformation introduced infinite values in continuous y-axis
#remove duplicates
sig3_norm_symbol <- unique(sig3_norm_symbol)
#get 1st 50
sig3_norm_symbol <- sig3_norm_symbol[1:70,]
#change symbol to be rownames instead of ensembl ids
row.names(sig3_norm_symbol) <- sig3_norm_symbol$symbol
#remove first and last column
sig3_norm_symbol <- sig3_norm_symbol[,c(-1,-16)]# Run pheatmap
pheatmap(sig3_norm_symbol[1:30,],
color = heat_colors,
cluster_rows =F,
show_rownames =T,
annotation = dplyr::select(colData, con),
scale ="row")pheatmap(sig3_norm_symbol[1:30,],
#color = heat_colors,
cluster_rows =F,
show_rownames =T,
annotation = dplyr::select(colData, con),cluster_cols=T, fontsize_row = 7,
scale ="row")
# Run pheatmap
pheatmap(sig3_norm_symbol[1:30,5:14],
#color = heat_colors,
cluster_rows =F,
cluster_cols = F,
show_rownames =T,
annotation = dplyr::select(colData, con),
fontsize_row = 7,
scale ="row")Now, we have three tables of DE genes. * The first one represents the DE genes from normal to RGP. * The second one represents the DE genes from RGP to VGP. * The third one represents the DE genes from VGP to MET.
#normal vs RGP
head(dds1_sig) ensgene <chr> | baseMean <dbl> | log2FoldChange <dbl> | lfcSE <dbl> | pvalue <dbl> | padj <dbl> | symbol <chr> | ||
|---|---|---|---|---|---|---|---|---|
| 1 | ENSG00000136040 | 736.3605 | -6.548698 | 0.2556398 | 6.743132e-146 | 1.444042e-141 | PLXNC1 | |
| 2 | ENSG00000123374 | 1324.7676 | -3.072022 | 0.1386348 | 8.838296e-110 | 9.463605e-106 | CDK2 | |
| 3 | ENSG00000008256 | 582.9415 | -3.522128 | 0.2152231 | 3.488942e-61 | 2.490523e-57 | CYTH3 | |
| 4 | ENSG00000119042 | 371.0564 | 5.238048 | 0.3271351 | 4.032854e-59 | 2.159089e-55 | SATB2 | |
| 5 | ENSG00000138771 | 253.5789 | 7.500462 | 0.4701808 | 1.917069e-57 | 8.210805e-54 | SHROOM3 | |
| 6 | ENSG00000135047 | 1987.7506 | -4.793100 | 0.3284617 | 2.174088e-49 | 7.759684e-46 | CTSL |
#RGP vs VGP
head(dds2_sig) ensgene <chr> | baseMean <dbl> | log2FoldChange <dbl> | lfcSE <dbl> | pvalue <dbl> | padj <dbl> | symbol <chr> | ||
|---|---|---|---|---|---|---|---|---|
| 1 | ENSG00000164741 | 938.53714 | 4.329980 | 0.2278739 | 5.834329e-82 | 1.189270e-77 | DLC1 | |
| 2 | ENSG00000008256 | 582.94152 | 3.112443 | 0.1853774 | 1.561472e-64 | 1.591452e-60 | CYTH3 | |
| 3 | ENSG00000124575 | 1932.32137 | -10.011641 | 0.6741329 | 5.887644e-50 | 4.000458e-46 | H1-3 | |
| 4 | ENSG00000138771 | 253.57894 | -4.116167 | 0.2819125 | 6.602015e-49 | 3.364387e-45 | SHROOM3 | |
| 5 | ENSG00000162576 | 73.99471 | -6.881397 | 0.4940942 | 9.545988e-45 | 3.891708e-41 | MXRA8 | |
| 6 | ENSG00000206538 | 447.01936 | 4.679596 | 0.3411158 | 2.540234e-44 | 8.630023e-41 | VGLL3 |
#VGP vs MET
head(dds3_sig) ensgene <chr> | baseMean <dbl> | log2FoldChange <dbl> | lfcSE <dbl> | pvalue <dbl> | padj <dbl> | symbol <chr> | ||
|---|---|---|---|---|---|---|---|---|
| 1 | ENSG00000124575 | 1932.3214 | 9.858460 | 0.5740376 | 1.178796e-66 | 2.402857e-62 | H1-3 | |
| 2 | ENSG00000206538 | 447.0194 | -4.268533 | 0.2631844 | 1.909101e-60 | 1.945756e-56 | VGLL3 | |
| 3 | ENSG00000164692 | 433.2373 | 10.495247 | 0.7121575 | 3.038157e-48 | 2.064327e-44 | COL1A2 | |
| 4 | ENSG00000148154 | 2917.7125 | -3.184533 | 0.2279413 | 9.542799e-46 | 4.863010e-42 | UGCG | |
| 5 | ENSG00000124092 | 168.2422 | 9.275250 | 0.6587017 | 9.930855e-44 | 4.048611e-40 | CTCFL | |
| 6 | ENSG00000111907 | 179.5958 | 3.327813 | 0.2458226 | 7.801240e-43 | 2.650341e-39 | TPD52L1 |
We will use the anti_join() from dplyr package to filter out the specific genes which is related only to the development of cells from normal to RGP. The idea is to subtract from both phases (“RGP vs VGP” table and “VGP vs MET”)
#subtract from "RGP vs VGP" table (dds2_sig)
dds1_1 <-anti_join(dds1_sig, dds2_sig, by= c("ensgene", "symbol"))
head(dds1_1)ensgene <chr> | baseMean <dbl> | log2FoldChange <dbl> | lfcSE <dbl> | pvalue <dbl> | padj <dbl> | symbol <chr> | ||
|---|---|---|---|---|---|---|---|---|
| 1 | ENSG00000136040 | 736.3605 | -6.548698 | 0.2556398 | 6.743132e-146 | 1.444042e-141 | PLXNC1 | |
| 2 | ENSG00000119042 | 371.0564 | 5.238048 | 0.3271351 | 4.032854e-59 | 2.159089e-55 | SATB2 | |
| 3 | ENSG00000162409 | 321.8638 | 7.370758 | 0.6159768 | 1.693040e-33 | 2.014248e-30 | PRKAA2 | |
| 4 | ENSG00000142949 | 900.4188 | 4.156074 | 0.3524792 | 2.516308e-33 | 2.836144e-30 | PTPRF | |
| 5 | ENSG00000142627 | 719.8698 | 5.323807 | 0.4599855 | 2.713355e-32 | 2.641204e-29 | EPHA2 | |
| 6 | ENSG00000039319 | 1890.7175 | -3.104663 | 0.2737239 | 8.162937e-31 | 6.723434e-28 | ZFYVE16 |
#subtract from "VGP vs MET" table (dds3_sig)
dds1_1 <-anti_join(dds1_1, dds3_sig, by= c("ensgene", "symbol"))
head(dds1_1)ensgene <chr> | baseMean <dbl> | log2FoldChange <dbl> | lfcSE <dbl> | pvalue <dbl> | padj <dbl> | symbol <chr> | ||
|---|---|---|---|---|---|---|---|---|
| 1 | ENSG00000162409 | 321.8638 | 7.370758 | 0.6159768 | 1.693040e-33 | 2.014248e-30 | PRKAA2 | |
| 2 | ENSG00000039319 | 1890.7175 | -3.104663 | 0.2737239 | 8.162937e-31 | 6.723434e-28 | ZFYVE16 | |
| 3 | ENSG00000161544 | 144.1902 | -6.492069 | 0.5725647 | 7.887861e-31 | 6.723434e-28 | CYGB | |
| 4 | ENSG00000104044 | 273.8170 | -10.357139 | 0.9557127 | 4.524479e-27 | 3.125540e-24 | OCA2 | |
| 5 | ENSG00000130775 | 101.3481 | -4.248885 | 0.4126370 | 5.927725e-26 | 3.733595e-23 | THEMIS2 | |
| 6 | ENSG00000105048 | 365.4464 | 5.073644 | 0.4986363 | 1.686641e-25 | 1.031983e-22 | TNNT1 |
So now we only have genes that are specific to Normal vs RGP and are not shared with other developmental phases.
#save the DE genes in a table
write.table(dds1_1,file = "DE_norm_RGP.txt", sep = "\t", row.names = FALSE)
genes1_1 <- as.factor(dds1_1$symbol)
write.table(genes1_1, "genes1_1.txt")export(genes1_1, "genes1_1.xlsx")The idea is to subtract from both phases (“normal vs RGP” and “VGP vs MET”)
#subtract from "normal vs RGP"
dds2_2 <-anti_join(dds2_sig, dds1_sig, by= c("ensgene", "symbol"))
View(dds2_2)#subtract from "VGP vs MET"
dds2_2 <-anti_join(dds2_2, dds3_sig, by= c("ensgene", "symbol"))
View(dds2_2)reduced from 897 Genes to 602.
#save the DE genes in a table
write.csv(dds2_2,file = "DE_RGP_VGP.csv", row.names = FALSE)
We will use the same approach
#subtract from "normal vs RGP(dds1_sig)"
dds3_3 <-anti_join(dds3_sig, dds1_sig, by= c("ensgene", "symbol"))
View(dds3_3)#subtract from "RGP vs VGP"
dds3_3 <-anti_join(dds3_3, dds2_sig, by= c("ensgene", "symbol"))
View(dds3_3)Genes are reduced from 1208 to 913 genes, which are specific only for this phase from vertical growth phase to Metastasis.
#save the DE genes in a table
write.table(dds3_3, file = "DE_VGP_MET.txt", sep = " ", col.names = NA)
GO? GSEQ?
venn.diagram(list("list C"=C, "list D"=D), fill = c("yellow","cyan"), cex = 1.5, filename="Lesson-06/Venn_diagram_genes_2.png")
venn.diagram(list(A = A, C = C, D = D), fill = c("yellow","red","cyan"), cex = 1.5,filename="Lesson-06/Venn_diagram_genes_3.png")
venn.diagram(list(A = A, B = B, C = C, D = D), fill = c("yellow","red","cyan","forestgreen"), cex = 1.5,filename="Lesson-06/Venn_diagram_genes_4.png")
#install.packages("VennDiagram")
library(VennDiagram)Loading required package: grid
Loading required package: futile.logger
#venn.diagram(list(A = dds1_omit, C = dds2_omit, D = dds3_omit), fill = c("yellow","red","cyan"), cex = 1.5,filename="Venn_diagram_genes_3.png")
which(is.na(dds1_sig)) [1] 30517 30892 31433 31472 31494 31935 32022 32480 32550 32717 32745
[12] 35252 35627 36168 36207 36229 36670 36757 37215 37285 37452 37480
sum(is.na(dds1_sig))[1] 22
delete na values
dds1_omit <- na.omit(dds1_sig)sum(is.na(dds1_omit))[1] 0
dds2_omit <- na.omit(dds2_sig)
dds3_omit <- na.omit(dds3_sig)
grid.newpage()
draw.triple.venn(area1 = 10,
area2 = 10,
area3 = 10,
n12 = 2,
n23 = 2,
n13 = 2,
n123 = 1,
fill = c("pink", "green", "orange"),
lty = "blank",
category = c("normal_Vs_RGP", "RGP_Vs_VGP", "VGP_Vs_MET"))(polygon[GRID.polygon.1081], polygon[GRID.polygon.1082], polygon[GRID.polygon.1083], polygon[GRID.polygon.1084], polygon[GRID.polygon.1085], polygon[GRID.polygon.1086], text[GRID.text.1087], text[GRID.text.1088], text[GRID.text.1089], text[GRID.text.1090], text[GRID.text.1091], text[GRID.text.1092], text[GRID.text.1093], text[GRID.text.1094], text[GRID.text.1095], text[GRID.text.1096])
head(dds1_omit)ensgene <chr> | baseMean <dbl> | log2FoldChange <dbl> | lfcSE <dbl> | pvalue <dbl> | padj <dbl> | symbol <chr> | ||
|---|---|---|---|---|---|---|---|---|
| 1 | ENSG00000136040 | 736.3605 | -6.548698 | 0.2556398 | 6.743132e-146 | 1.444042e-141 | PLXNC1 | |
| 2 | ENSG00000123374 | 1324.7676 | -3.072022 | 0.1386348 | 8.838296e-110 | 9.463605e-106 | CDK2 | |
| 3 | ENSG00000008256 | 582.9415 | -3.522128 | 0.2152231 | 3.488942e-61 | 2.490523e-57 | CYTH3 | |
| 4 | ENSG00000119042 | 371.0564 | 5.238048 | 0.3271351 | 4.032854e-59 | 2.159089e-55 | SATB2 | |
| 5 | ENSG00000138771 | 253.5789 | 7.500462 | 0.4701808 | 1.917069e-57 | 8.210805e-54 | SHROOM3 | |
| 6 | ENSG00000135047 | 1987.7506 | -4.793100 | 0.3284617 | 2.174088e-49 | 7.759684e-46 | CTSL |
dds1_venn <- dds1_omit$ensgene
class(dds1_venn)[1] "character"
dds2_venn <- dds2_omit$ensgene
class(dds2_venn)[1] "character"
dds3_venn <- dds3_omit$ensgene
venn.diagram(list("normal vs RGP" = dds1_venn, "RGP vs VGP" = dds2_venn, "VGP vs MET" = dds3_venn), fill = c("yellow","red","cyan"), cex = 1.5,filename="Venn_diagram_1.png")[1] 1
grid.draw(venn.plot)
venn.plot <- draw.pairwise.venn(length(dds1_venn),
length(dds2_venn),
# Calculate the intersection of the two sets
length( intersect(dds1_venn, dds2_venn) ),
category = c("Normal vs RGP", "RGP vs VGP"), scaled = F,
fill = c("light blue", "pink"), alpha = rep(0.5, 2),
cat.pos = c(0, 0)) ;
grid.draw(venn.plot);
grid.newpage();library(gplots)
Attaching package: ‘gplots’
The following object is masked from ‘package:IRanges’:
space
The following object is masked from ‘package:S4Vectors’:
space
The following object is masked from ‘package:stats’:
lowess
# Create a Venn-diagram given just the list of gene-names for both sets
#venn(list("IPSC Trisomic vs Disomic" = ipsc.degs,
# "NEURON Trisomic vs Disomic" = neur.degs), )library(gplots)
venn(list("noramal vs RGP" = dds1_venn,
"RGP vs VGP" = dds2_venn,
"VGP vs MET"= dds3_venn))venn(list("RGP vs VGP" = dds2_venn,
"VGP vs MET"= dds3_venn), )Now, we will use clusterProfiler to perform: * Over representation analysis * Gene Set Enrichment Analysis GSEA
library(org.Hs.eg.db)Loading required package: AnnotationDbi
Attaching package: ‘AnnotationDbi’
The following object is masked from ‘package:dplyr’:
select
library(DOSE)DOSE v3.20.1 For help: https://yulab-smu.top/biomedical-knowledge-mining-book/
If you use DOSE in published research, please cite:
Guangchuang Yu, Li-Gen Wang, Guang-Rong Yan, Qing-Yu He. DOSE: an R/Bioconductor package for Disease Ontology Semantic and Enrichment analysis. Bioinformatics 2015, 31(4):608-609
library(pathview)##############################################################################
Pathview is an open source software package distributed under GNU General
Public License version 3 (GPLv3). Details of GPLv3 is available at
http://www.gnu.org/licenses/gpl-3.0.html. Particullary, users are required to
formally cite the original Pathview paper (not just mention it) in publications
or products. For details, do citation("pathview") within R.
The pathview downloads and uses KEGG data. Non-academic uses may require a KEGG
license agreement (details at http://www.kegg.jp/kegg/legal.html).
##############################################################################
library(clusterProfiler)clusterProfiler v4.2.2 For help: https://yulab-smu.top/biomedical-knowledge-mining-book/
If you use clusterProfiler in published research, please cite:
T Wu, E Hu, S Xu, M Chen, P Guo, Z Dai, T Feng, L Zhou, W Tang, L Zhan, X Fu, S Liu, X Bo, and G Yu. clusterProfiler 4.0: A universal enrichment tool for interpreting omics data. The Innovation. 2021, 2(3):100141
Attaching package: ‘clusterProfiler’
The following object is masked from ‘package:AnnotationDbi’:
select
The following object is masked from ‘package:purrr’:
simplify
The following object is masked from ‘package:IRanges’:
slice
The following object is masked from ‘package:S4Vectors’:
rename
The following object is masked from ‘package:stats’:
filter
library(AnnotationHub)Loading required package: BiocFileCache
Loading required package: dbplyr
Attaching package: ‘dbplyr’
The following objects are masked from ‘package:dplyr’:
ident, sql
Attaching package: ‘AnnotationHub’
The following object is masked from ‘package:Biobase’:
cache
library(ensembldb)Loading required package: GenomicFeatures
Loading required package: AnnotationFilter
Attaching package: 'ensembldb'
The following object is masked from 'package:clusterProfiler':
filter
The following object is masked from 'package:dplyr':
filter
The following object is masked from 'package:openxlsx':
addFilter
The following object is masked from 'package:stats':
filter
library(enrichplot)
library(ggnewscale)
library(tidyverse)To perform the over-representation analysis, we need a list of background genes and a list of significant genes. For our background dataset we will use all genes tested for differential expression (all genes in our results table). For our significant gene list we will use genes with p-adjusted values less than 0.05 (we could include a fold change threshold too if we have many DE genes).
## Create background dataset for hypergeometric testing using all genes tested for significance in the results
allOE_genes <- as.character(dds1_res_anno$ensgene)
## Extract significant results
sigOE <- dplyr::filter(dds1_res_anno, padj < 0.05)
sigOE_genes <- as.character(sigOE$ensgene) #genes vectorNow we can perform the GO enrichment analysis and save the results:
## Run GO enrichment analysis
ego <- enrichGO(gene = sigOE_genes,
universe = allOE_genes,
keyType = "ENSEMBL",
OrgDb = org.Hs.eg.db,
ont = "BP",
pAdjustMethod = "BH",
qvalueCutoff = 0.05,
readable = TRUE)
## Output results from GO analysis to a table
cluster_summary <- data.frame(ego)
write.csv(cluster_summary, "GO1_NorRad.csv")clusterProfiler has a variety of options for viewing the over-represented GO terms. We will explore the dotplot, enrichment plot, and the category netplot.
The dotplot shows the number of genes associated with the first 50 terms (size) and the p-adjusted values for these terms (color). This plot displays the top 50 genes by gene ratio (# genes related to GO term / total number of sig genes), not p-adjusted value.
dotplot(ego, showCategory=25, title = "GO Biological process enrichment",label_format = 55)
barplot(ego, showCategory = 15)head(ego2)ID <chr> | Description <chr> | GeneRatio <chr> | BgRatio <chr> | ||
|---|---|---|---|---|---|
| GO:0030335 | GO:0030335 | positive regulation of cell migration | 168/3609 | 464/14366 | |
| GO:0040017 | GO:0040017 | positive regulation of locomotion | 174/3609 | 491/14366 | |
| GO:2000147 | GO:2000147 | positive regulation of cell motility | 171/3609 | 481/14366 | |
| GO:0051272 | GO:0051272 | positive regulation of cellular component movement | 174/3609 | 492/14366 | |
| GO:0042330 | GO:0042330 | taxis | 166/3609 | 467/14366 | |
| GO:0001667 | GO:0001667 | ameboidal-type cell migration | 138/3609 | 375/14366 |
The next plot is the enrichment GO plot, which shows the relationship between the top 50 most significantly enriched GO terms (padj.), by grouping similar terms together. The color represents the p-values relative to the other displayed terms (brighter red is more significant) and the size of the terms represents the number of genes that are significant from our list.
a network with edges connecting overlapping gene sets. In this way, mutually overlapping gene sets are tend to cluster together, making it easy to identify functional module.
## Enrichmap clusters the 50 most significant (by padj) GO terms to visualize relationships between terms
#emapplot(ego, showCategory = 10)
##Error in has_pairsim(x) Please use pairwise_termsim function to deal with the results of enrichment analysis.x2 <- pairwise_termsim(ego)
emapplot(x2, showCategory = 10, label_format= 40, cex_category=.8)Finally, the category netplot shows the relationships between the genes associated with the top five most significant GO terms and the fold changes of the significant genes associated with these terms (color). The size of the GO terms reflects the pvalues of the terms, with the more significant terms being larger. This plot is particularly useful for hypothesis generation in identifying genes that may be important to several of the most affected processes.
OE_foldchanges <- sigOE$log2FoldChange
names(OE_foldchanges) <- sigOE$symbol## Cnetplot details the genes associated with one or more terms - by default gives the top 5 significant terms (by padj)
cnetplot(ego,
categorySize="pvalue",
showCategory = 5,
foldChange=OE_foldchanges,
vertex.label.font=3,
ggrepel.max.overlaps = Inf,
)Warning: ggrepel: 266 unlabeled data points (too many overlaps). Consider increasing max.overlaps
ego2 <- data.frame(ego)
View(ego2)If you are interested in significant processes that are not among the top five, you can subset your ego dataset to only display these processes:
# find index to subset
which(ego2$ID == "GO:0042438") #melanin biosynthetic processinteger(0)
which(ego2$ID == "GO:0048021") #regulation of melanin biosynthetic integer(0)
which(ego2$ID == "GO:0010631") #epithelial cell migration[1] 15
which(ego2$ID == "GO:0043473") #Pigmentation[1] 62
which(ego2$ID == "GO:0050673") #epithelial cell proliferation[1] 34
which(ego2$ID == "GO:0043588") #skin development[1] 33
## Subsetting the ego results without overwriting original `ego` variable
ego3 <- ego
ego3@result <- ego@result[c(140
,611,14,71,45),]
## Plotting terms of interest
cnetplot(ego3,
categorySize="pvalue",
foldChange=OE_foldchanges,
showCategory = 5,
vertex.label.font=6)Warning: ggrepel: 215 unlabeled data points (too many overlaps). Consider increasing max.overlaps
head(ego3)ID <chr> | Description <chr> | ||
|---|---|---|---|
| GO:0003007 | GO:0003007 | heart morphogenesis | |
| GO:0006470 | GO:0006470 | protein dephosphorylation | |
| GO:0048284 | GO:0048284 | organelle fusion | |
| GO:0090050 | GO:0090050 | positive regulation of cell migration involved in sprouting angiogenesis |
## convert gene ID to Symbol
edox <- setReadable(ego, 'org.Hs.eg.db', 'ENTREZID')
p1 <- cnetplot(edox, foldChange=OE_foldchanges)
## categorySize can be scaled by 'pvalue' or 'geneNum'
p2 <- cnetplot(edox, categorySize="pvalue", foldChange=OE_foldchanges)
p3 <- cnetplot(edox,showCategory = 3, foldChange=OE_foldchanges, circular = TRUE, colorEdge = TRUE)
cowplot::plot_grid(p1, p2, p3, ncol=3, labels=LETTERS[1:3], rel_widths=c(.8, .8, 1.2))Warning: ggrepel: 274 unlabeled data points (too many overlaps). Consider increasing max.overlaps
Warning: ggrepel: 274 unlabeled data points (too many overlaps). Consider increasing max.overlaps
Warning: ggrepel: 173 unlabeled data points (too many overlaps). Consider increasing max.overlaps
cowplot::plot_grid( p3, rel_widths=c(.8, .8, 1.2))Warning: ggrepel: 138 unlabeled data points (too many overlaps). Consider increasing max.overlaps
#convert norm_counts to df
norm_counts_df <- data.frame(norm_counts)norm_counts_GSEA <- norm_counts_df %>% mutate(desc= NA, .before="NHEM76")head(norm_counts_GSEA)desc <lgl> | NHEM76 <dbl> | NHEM77 <dbl> | Sbcl2_64 <dbl> | Sbcl2_70 <dbl> | WM1366_66 <dbl> | WM1366_72 <dbl> | WM3211_65 <dbl> | ||
|---|---|---|---|---|---|---|---|---|---|
| ENSG00000223972 | NA | 1.71292 | 0.000000 | 1.7328766 | 0.8323737 | 4.758488 | 3.937283 | 1.351221 | |
| ENSG00000227232 | NA | 5.99522 | 5.206045 | 8.6643831 | 7.4913633 | 7.930814 | 7.874565 | 9.458546 | |
| ENSG00000241860 | NA | 0.00000 | 1.156899 | 1.7328766 | 1.6647474 | 3.172326 | 3.937283 | 1.351221 | |
| ENSG00000279457 | NA | 3.42584 | 5.206045 | 16.4623279 | 10.8208581 | 3.172326 | 1.574913 | 2.702442 | |
| ENSG00000228463 | NA | 9.42106 | 11.568989 | 0.8664383 | 2.4971211 | 15.861628 | 3.937283 | 9.458546 | |
| ENSG00000237094 | NA | 2.56938 | 1.156899 | 4.3321916 | 4.1618685 | 11.103139 | 19.686414 | 8.107325 |
# give the genes coloumn a proper name
norm_counts_GSEA <- cbind(ID = rownames(norm_counts_GSEA), norm_counts_GSEA)
head(norm_counts_GSEA)ID <chr> | desc <lgl> | NHEM76 <dbl> | NHEM77 <dbl> | Sbcl2_64 <dbl> | Sbcl2_70 <dbl> | WM1366_66 <dbl> | WM1366_72 <dbl> | ||
|---|---|---|---|---|---|---|---|---|---|
| ENSG00000223972 | ENSG00000223972 | NA | 1.71292 | 0.000000 | 1.7328766 | 0.8323737 | 4.758488 | 3.937283 | |
| ENSG00000227232 | ENSG00000227232 | NA | 5.99522 | 5.206045 | 8.6643831 | 7.4913633 | 7.930814 | 7.874565 | |
| ENSG00000241860 | ENSG00000241860 | NA | 0.00000 | 1.156899 | 1.7328766 | 1.6647474 | 3.172326 | 3.937283 | |
| ENSG00000279457 | ENSG00000279457 | NA | 3.42584 | 5.206045 | 16.4623279 | 10.8208581 | 3.172326 | 1.574913 | |
| ENSG00000228463 | ENSG00000228463 | NA | 9.42106 | 11.568989 | 0.8664383 | 2.4971211 | 15.861628 | 3.937283 | |
| ENSG00000237094 | ENSG00000237094 | NA | 2.56938 | 1.156899 | 4.3321916 | 4.1618685 | 11.103139 | 19.686414 |
# remove original rownames
#rownames(norm_counts_GSEA ) <- NULLhead(norm_counts_GSEA)ID <chr> | desc <lgl> | NHEM76 <dbl> | NHEM77 <dbl> | Sbcl2_64 <dbl> | Sbcl2_70 <dbl> | WM1366_66 <dbl> | WM1366_72 <dbl> | ||
|---|---|---|---|---|---|---|---|---|---|
| ENSG00000223972 | ENSG00000223972 | NA | 1.71292 | 0.000000 | 1.7328766 | 0.8323737 | 4.758488 | 3.937283 | |
| ENSG00000227232 | ENSG00000227232 | NA | 5.99522 | 5.206045 | 8.6643831 | 7.4913633 | 7.930814 | 7.874565 | |
| ENSG00000241860 | ENSG00000241860 | NA | 0.00000 | 1.156899 | 1.7328766 | 1.6647474 | 3.172326 | 3.937283 | |
| ENSG00000279457 | ENSG00000279457 | NA | 3.42584 | 5.206045 | 16.4623279 | 10.8208581 | 3.172326 | 1.574913 | |
| ENSG00000228463 | ENSG00000228463 | NA | 9.42106 | 11.568989 | 0.8664383 | 2.4971211 | 15.861628 | 3.937283 | |
| ENSG00000237094 | ENSG00000237094 | NA | 2.56938 | 1.156899 | 4.3321916 | 4.1618685 | 11.103139 | 19.686414 |
#save the normalized counts in a table ready for GSEA Java based program
write.table(norm_counts_GSEA, file = "GSEA/Normal2_GSEA.txt", sep = "\t", row.names = F)
#write.table(norm_counts, file = "GSEA/norm_counts.tsv", sep= "\t", col.names = NA)
export(norm_counts, rowNames = T, "GSEA/norm_counts.xlsx")#convert norm_counts to df
norm_counts_df <- data.frame(norm_counts)
normal_Radial_GSEA <- norm_counts_df %>% dplyr::select(1:4)
head(normal_Radial_GSEA)NHEM76 <dbl> | NHEM77 <dbl> | Sbcl2_64 <dbl> | Sbcl2_70 <dbl> | |
|---|---|---|---|---|
| ENSG00000223972 | 1.71292 | 0.000000 | 1.7328766 | 0.8323737 |
| ENSG00000227232 | 5.99522 | 5.206045 | 8.6643831 | 7.4913633 |
| ENSG00000241860 | 0.00000 | 1.156899 | 1.7328766 | 1.6647474 |
| ENSG00000279457 | 3.42584 | 5.206045 | 16.4623279 | 10.8208581 |
| ENSG00000228463 | 9.42106 | 11.568989 | 0.8664383 | 2.4971211 |
| ENSG00000237094 | 2.56938 | 1.156899 | 4.3321916 | 4.1618685 |
normal_Radial_GSEA <- normal_Radial_GSEA %>% mutate(desc= NA, .before="NHEM76")head(normal_Radial_GSEA)desc <lgl> | NHEM76 <dbl> | NHEM77 <dbl> | Sbcl2_64 <dbl> | Sbcl2_70 <dbl> | |
|---|---|---|---|---|---|
| ENSG00000223972 | NA | 1.71292 | 0.000000 | 1.7328766 | 0.8323737 |
| ENSG00000227232 | NA | 5.99522 | 5.206045 | 8.6643831 | 7.4913633 |
| ENSG00000241860 | NA | 0.00000 | 1.156899 | 1.7328766 | 1.6647474 |
| ENSG00000279457 | NA | 3.42584 | 5.206045 | 16.4623279 | 10.8208581 |
| ENSG00000228463 | NA | 9.42106 | 11.568989 | 0.8664383 | 2.4971211 |
| ENSG00000237094 | NA | 2.56938 | 1.156899 | 4.3321916 | 4.1618685 |
# give the genes coloumn a proper name
normal_Radial_GSEA <- cbind(ID = rownames(normal_Radial_GSEA), normal_Radial_GSEA)
head(normal_Radial_GSEA)ID <chr> | desc <lgl> | NHEM76 <dbl> | NHEM77 <dbl> | Sbcl2_64 <dbl> | Sbcl2_70 <dbl> | |
|---|---|---|---|---|---|---|
| ENSG00000223972 | ENSG00000223972 | NA | 1.71292 | 0.000000 | 1.7328766 | 0.8323737 |
| ENSG00000227232 | ENSG00000227232 | NA | 5.99522 | 5.206045 | 8.6643831 | 7.4913633 |
| ENSG00000241860 | ENSG00000241860 | NA | 0.00000 | 1.156899 | 1.7328766 | 1.6647474 |
| ENSG00000279457 | ENSG00000279457 | NA | 3.42584 | 5.206045 | 16.4623279 | 10.8208581 |
| ENSG00000228463 | ENSG00000228463 | NA | 9.42106 | 11.568989 | 0.8664383 | 2.4971211 |
| ENSG00000237094 | ENSG00000237094 | NA | 2.56938 | 1.156899 | 4.3321916 | 4.1618685 |
# remove original rownames
rownames(normal_Radial_GSEA ) <- NULLhead(normal_Radial_GSEA)ID <chr> | desc <lgl> | NHEM76 <dbl> | NHEM77 <dbl> | Sbcl2_64 <dbl> | Sbcl2_70 <dbl> | |
|---|---|---|---|---|---|---|
| 1 | ENSG00000223972 | NA | 1.71292 | 0.000000 | 1.7328766 | 0.8323737 |
| 2 | ENSG00000227232 | NA | 5.99522 | 5.206045 | 8.6643831 | 7.4913633 |
| 3 | ENSG00000241860 | NA | 0.00000 | 1.156899 | 1.7328766 | 1.6647474 |
| 4 | ENSG00000279457 | NA | 3.42584 | 5.206045 | 16.4623279 | 10.8208581 |
| 5 | ENSG00000228463 | NA | 9.42106 | 11.568989 | 0.8664383 | 2.4971211 |
| 6 | ENSG00000237094 | NA | 2.56938 | 1.156899 | 4.3321916 | 4.1618685 |
#save the normalized NormalVSRadial counts in a table
write.table(normal_Radial_GSEA, file = "normal_Rad_GSEA.txt", sep = "\t", row.names = F)
export(normal_Radial_GSEA, "normal_Rad_GSEA.xlsx")class(normal_Radial_GSEA)[1] "data.frame"
head(normal_Radial_GSEA)ID <chr> | desc <lgl> | NHEM76 <dbl> | NHEM77 <dbl> | Sbcl2_64 <dbl> | Sbcl2_70 <dbl> | |
|---|---|---|---|---|---|---|
| 1 | ENSG00000223972 | NA | 1.71292 | 0.000000 | 1.7328766 | 0.8323737 |
| 2 | ENSG00000227232 | NA | 5.99522 | 5.206045 | 8.6643831 | 7.4913633 |
| 3 | ENSG00000241860 | NA | 0.00000 | 1.156899 | 1.7328766 | 1.6647474 |
| 4 | ENSG00000279457 | NA | 3.42584 | 5.206045 | 16.4623279 | 10.8208581 |
| 5 | ENSG00000228463 | NA | 9.42106 | 11.568989 | 0.8664383 | 2.4971211 |
| 6 | ENSG00000237094 | NA | 2.56938 | 1.156899 | 4.3321916 | 4.1618685 |
#DAVID1 <- read.table("DAVID/Norm_Rad/chart_AAD5D588820E1657803685183.tsv", sep ="\t", header = T)#head(DAVID1)#barplot
#ggplot(DAVID1[1:5], aes(x=Count, y = Term))
#+geom_bar()Steps toward doing gene set enrichment analysis (GSEA):
1- obtaining stats for ranking genes in your experiment, 2- creating a named vector out of the DESeq2 result 3- Obtaining a gene set from mysigbd 4- doing analysis
already we performed DESeq2 analysis and have statistics for working on it
dds1_GSEA<- results(dds,
contrast = c("con", "radial", "normal"),
alpha = 0.05)Error in cleanContrast(object, contrast, expanded = isExpanded, listValues = listValues, :
radial and normal should be levels of con such that con_radial_vs_vertical and con_normal_vs_vertical are contained in 'resultsNames(object)'
barplot(hm_ora_VGP_MET)Multiple comparison overtime #cancelled. how the genes behave overtime #cancelled. time dependancy #cancelled . time series #cancelled we have 7 gene sets GO term list based on genes vis of diff exp in each step
#writing #introduction #material and methods
ref bioc hashes with work illustration